Series comparison

-[PULL 00/52] tcg patch queue
+[PULL 00/72] tcg patch queue
-The following changes since commit b52daaf2c868f2bab102eb5acbf55b2917f46aea:
+The following changes since commit aa3a285b5bc56a4208b3b57d4a55291e9c260107:
-  Merge tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu into staging (2023-06-05 10:27:31 -0700)
+  Merge tag 'mem-2024-12-21' of https://github.com/davidhildenbrand/qemu into staging (2024-12-22 14:33:27 -0500)
 are available in the Git repository at:
-  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230605
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20241224
-for you to fetch changes up to a7f6911c127b1dd1b8764e03b0ebcf0a227a15e4:
+for you to fetch changes up to e4a8e093dc74be049f4829831dce76e5edab0003:
-  tcg/tcg-op-vec: Remove left over _link_error() definitions (2023-06-05 12:20:16 -0700)
+  accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core (2024-12-24 08:32:15 -0800)
 ----------------------------------------------------------------
-Build tcg/ once for system and once for user.
+tcg/optimize: Remove in-flight mask data from OptContext
-Unmap perf_marker.
+fpu: Add float*_muladd_scalbn
-Remove left over _link_error() definitions.
+fpu: Remove float_muladd_halve_result
 fpu: Add float_round_nearest_even_max
 fpu: Add float_muladd_suppress_add_product_zero
 target/hexagon: Use float32_muladd
 accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core
 ----------------------------------------------------------------
 Ilya Leoshkevich (1):
-      accel/tcg: Unmap perf_marker
+      tests/tcg: Do not use inttypes.h in multiarch/system/memory.c
-Philippe Mathieu-Daudé (2):
+Pierrick Bouvier (1):
-      target/ppc: Inline gen_icount_io_start()
+      plugins: optimize cpu_index code generation
       tcg/tcg-op-vec: Remove left over _link_error() definitions
-Richard Henderson (49):
+Richard Henderson (70):
-      tcg/ppc: Remove TARGET_LONG_BITS, TCG_TYPE_TL
+      tcg/optimize: Split out finish_bb, finish_ebb
-      tcg/riscv: Remove TARGET_LONG_BITS, TCG_TYPE_TL
+      tcg/optimize: Split out fold_affected_mask
-      tcg/s390x: Remove TARGET_LONG_BITS, TCG_TYPE_TL
+      tcg/optimize: Copy mask writeback to fold_masks
-      tcg/sparc64: Remove TARGET_LONG_BITS, TCG_TYPE_TL
+      tcg/optimize: Split out fold_masks_zs
-      tcg: Move TCG_TYPE_TL from tcg.h to tcg-op.h
+      tcg/optimize: Augment s_mask from z_mask in fold_masks_zs
-      tcg: Widen CPUTLBEntry comparators to 64-bits
+      tcg/optimize: Change representation of s_mask
-      tcg: Add tlb_fast_offset to TCGContext
+      tcg/optimize: Use finish_folding in fold_add, fold_add_vec, fold_addsub2
-      target/avr: Add missing includes of qemu/error-report.h
+      tcg/optimize: Introduce const value accessors for TempOptInfo
-      target/*: Add missing includes of tcg/debug-assert.h
+      tcg/optimize: Use fold_masks_zs in fold_and
-      *: Add missing includes of tcg/tcg.h
+      tcg/optimize: Use fold_masks_zs in fold_andc
-      tcg: Split out tcg-target-reg-bits.h
+      tcg/optimize: Use fold_masks_zs in fold_bswap
-      target/arm: Fix test of TCG_OVERSIZED_GUEST
+      tcg/optimize: Use fold_masks_zs in fold_count_zeros
-      tcg: Split out tcg/oversized-guest.h
+      tcg/optimize: Use fold_masks_z in fold_ctpop
-      tcg: Move TCGv, dup_const_tl definitions to tcg-op.h
+      tcg/optimize: Use fold_and and fold_masks_z in fold_deposit
-      tcg: Split tcg/tcg-op-common.h from tcg/tcg-op.h
+      tcg/optimize: Compute sign mask in fold_deposit
-      target/arm: Include helper-gen.h in translator.h
+      tcg/optimize: Use finish_folding in fold_divide
-      target/hexagon: Include helper-gen.h where needed
+      tcg/optimize: Use finish_folding in fold_dup, fold_dup2
-      tcg: Remove outdated comments in helper-head.h
+      tcg/optimize: Use fold_masks_s in fold_eqv
-      tcg: Move TCGHelperInfo and dependencies to tcg/helper-info.h
+      tcg/optimize: Use fold_masks_z in fold_extract
-      tcg: Pass TCGHelperInfo to tcg_gen_callN
+      tcg/optimize: Use finish_folding in fold_extract2
-      tcg: Move temp_idx and tcgv_i32_temp debug out of line
+      tcg/optimize: Use fold_masks_zs in fold_exts
-      tcg: Split tcg_gen_callN
+      tcg/optimize: Use fold_masks_z in fold_extu
-      tcg: Split helper-gen.h
+      tcg/optimize: Use fold_masks_zs in fold_movcond
-      tcg: Split helper-proto.h
+      tcg/optimize: Use finish_folding in fold_mul*
-      target/sh4: Emit insn_start for each insn in gUSA region
+      tcg/optimize: Use fold_masks_s in fold_nand
-      tcg: Add insn_start_words to TCGContext
+      tcg/optimize: Use fold_masks_z in fold_neg_no_const
-      tcg: Add guest_mo to TCGContext
+      tcg/optimize: Use fold_masks_s in fold_nor
-      tcg: Move TLB_FLAGS_MASK check out of get_alignment_bits
+      tcg/optimize: Use fold_masks_s in fold_not
-      tcg: Split tcg/tcg-op-gvec.h
+      tcg/optimize: Use fold_masks_zs in fold_or
-      tcg: Remove NO_CPU_IO_DEFS
+      tcg/optimize: Use fold_masks_zs in fold_orc
-      exec-all: Widen tb_page_addr_t for user-only
+      tcg/optimize: Use fold_masks_zs in fold_qemu_ld
-      exec-all: Widen TranslationBlock pc and cs_base to 64-bits
+      tcg/optimize: Return true from fold_qemu_st, fold_tcg_st
-      tcg: Spit out exec/translation-block.h
+      tcg/optimize: Use finish_folding in fold_remainder
-      include/exec: Remove CODE_GEN_AVG_BLOCK_SIZE
+      tcg/optimize: Distinguish simplification in fold_setcond_zmask
-      accel/tcg: Move most of gen-icount.h into translator.c
+      tcg/optimize: Use fold_masks_z in fold_setcond
-      accel/tcg: Introduce translator_io_start
+      tcg/optimize: Use fold_masks_s in fold_negsetcond
-      accel/tcg: Move translator_fake_ldb out of line
+      tcg/optimize: Use fold_masks_z in fold_setcond2
-      target/arm: Tidy helpers for translation
+      tcg/optimize: Use finish_folding in fold_cmp_vec
-      target/mips: Tidy helpers for translation
+      tcg/optimize: Use finish_folding in fold_cmpsel_vec
-      target/*: Add missing includes of exec/translation-block.h
+      tcg/optimize: Use fold_masks_zs in fold_sextract
-      target/arm: Add missing include of exec/exec-all.h
+      tcg/optimize: Use fold_masks_zs, fold_masks_s in fold_shift
-      accel/tcg: Tidy includes for translator.[ch]
+      tcg/optimize: Simplify sign bit test in fold_shift
-      tcg: Fix PAGE/PROT confusion
+      tcg/optimize: Use finish_folding in fold_sub, fold_sub_vec
-      tcg: Move env defines out of NEED_CPU_H in helper-head.h
+      tcg/optimize: Use fold_masks_zs in fold_tcg_ld
-      tcg: Remove target-specific headers from tcg.[ch]
+      tcg/optimize: Use finish_folding in fold_tcg_ld_memcopy
-      plugins: Move plugin_insn_append to translator.c
+      tcg/optimize: Use fold_masks_zs in fold_xor
-      plugins: Drop unused headers from exec/plugin-gen.h
+      tcg/optimize: Use finish_folding in fold_bitsel_vec
-      exec/poison: Do not poison CONFIG_SOFTMMU
+      tcg/optimize: Use finish_folding as default in tcg_optimize
-      tcg: Build once for system and once for user-only
+      tcg/optimize: Remove z_mask, s_mask from OptContext
       tcg/optimize: Re-enable sign-mask optimizations
       tcg/optimize: Move fold_bitsel_vec into alphabetic sort
       tcg/optimize: Move fold_cmp_vec, fold_cmpsel_vec into alphabetic sort
       softfloat: Add float{16,32,64}_muladd_scalbn
       target/arm: Use float*_muladd_scalbn
       target/sparc: Use float*_muladd_scalbn
       softfloat: Remove float_muladd_halve_result
       softfloat: Add float_round_nearest_even_max
       softfloat: Add float_muladd_suppress_add_product_zero
       target/hexagon: Use float32_mul in helper_sfmpy
       target/hexagon: Use float32_muladd for helper_sffma
       target/hexagon: Use float32_muladd for helper_sffms
       target/hexagon: Use float32_muladd_scalbn for helper_sffma_sc
       target/hexagon: Use float32_muladd for helper_sffm[as]_lib
       target/hexagon: Remove internal_fmafx
       target/hexagon: Expand GEN_XF_ROUND
       target/hexagon: Remove Float
       target/hexagon: Remove Double
       target/hexagon: Use mulu64 for int128_mul_6464
       target/hexagon: Simplify internal_mpyhh setup
       accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core
- MAINTAINERS                                        |    3 +-
+ include/exec/translator.h           |  14 -
- include/exec/cpu-all.h                             |    3 +
+ include/fpu/softfloat-types.h       |   2 +
- include/exec/cpu-defs.h                            |   50 +-
+ include/fpu/softfloat.h             |  14 +-
- include/exec/cpu_ldst.h                            |   22 +-
+ include/hw/core/tcg-cpu-ops.h       |  13 +
- include/exec/exec-all.h                            |  142 +--
+ target/alpha/cpu.h                  |   2 +
- include/exec/gen-icount.h                          |   83 --
+ target/arm/internals.h              |   2 +
- include/exec/helper-gen-common.h                   |   18 +
+ target/avr/cpu.h                    |   2 +
- include/exec/helper-gen.h                          |   97 +-
+ target/hexagon/cpu.h                |   2 +
- include/exec/helper-head.h                         |   24 +-
+ target/hexagon/fma_emu.h            |   3 -
- include/exec/helper-proto-common.h                 |   18 +
+ target/hppa/cpu.h                   |   2 +
- include/exec/helper-proto.h                        |   73 +-
+ target/i386/tcg/helper-tcg.h        |   2 +
- include/exec/helper-tcg.h                          |   75 --
+ target/loongarch/internals.h        |   2 +
- include/exec/plugin-gen.h                          |   24 -
+ target/m68k/cpu.h                   |   2 +
- include/exec/poison.h                              |    1 -
+ target/microblaze/cpu.h             |   2 +
- include/exec/tlb-common.h                          |   56 ++
+ target/mips/tcg/tcg-internal.h      |   2 +
- include/exec/translation-block.h                   |  149 +++
+ target/openrisc/cpu.h               |   2 +
- include/exec/translator.h                          |   24 +-
+ target/ppc/cpu.h                    |   2 +
- include/qemu/typedefs.h                            |    1 +
+ target/riscv/cpu.h                  |   3 +
- include/tcg/helper-info.h                          |   64 ++
+ target/rx/cpu.h                     |   2 +
- include/tcg/insn-start-words.h                     |   17 +
+ target/s390x/s390x-internal.h       |   2 +
- include/tcg/oversized-guest.h                      |   23 +
+ target/sh4/cpu.h                    |   2 +
- include/tcg/tcg-op-common.h                        |  996 +++++++++++++++++++
+ target/sparc/cpu.h                  |   2 +
- include/tcg/tcg-op-gvec-common.h                   |  426 ++++++++
+ target/sparc/helper.h               |   4 +-
- include/tcg/tcg-op-gvec.h                          |  444 +--------
+ target/tricore/cpu.h                |   2 +
- include/tcg/tcg-op.h                               | 1033 +-------------------
+ target/xtensa/cpu.h                 |   2 +
- include/tcg/tcg-opc.h                              |    6 +-
+ accel/tcg/cpu-exec.c                |   8 +-
- include/tcg/tcg.h                                  |  107 +-
+ accel/tcg/plugin-gen.c              |   9 +
- target/arm/cpregs.h                                |    4 +-
+ accel/tcg/translate-all.c           |   8 +-
- target/arm/tcg/translate.h                         |    5 +
+ fpu/softfloat.c                     |  63 +--
- target/mips/tcg/translate.h                        |    5 +-
+ target/alpha/cpu.c                  |   1 +
- target/ppc/cpu.h                                   |    2 -
+ target/alpha/translate.c            |   4 +-
- target/sparc/cpu.h                                 |    2 -
+ target/arm/cpu.c                    |   1 +
- tcg/aarch64/tcg-target-reg-bits.h                  |   12 +
+ target/arm/tcg/cpu-v7m.c            |   1 +
- tcg/arm/tcg-target-reg-bits.h                      |   12 +
+ target/arm/tcg/helper-a64.c         |   6 +-
- tcg/i386/tcg-target-reg-bits.h                     |   16 +
+ target/arm/tcg/translate.c          |   5 +-
- tcg/i386/tcg-target.h                              |    2 -
+ target/avr/cpu.c                    |   1 +
- tcg/loongarch64/tcg-target-reg-bits.h              |   21 +
+ target/avr/translate.c              |   6 +-
- tcg/loongarch64/tcg-target.h                       |   11 -
+ target/hexagon/cpu.c                |   1 +
- tcg/mips/tcg-target-reg-bits.h                     |   18 +
+ target/hexagon/fma_emu.c            | 496 ++++++---------------
- tcg/mips/tcg-target.h                              |    8 -
+ target/hexagon/op_helper.c          | 125 ++----
- tcg/ppc/tcg-target-reg-bits.h                      |   16 +
+ target/hexagon/translate.c          |   4 +-
- tcg/ppc/tcg-target.h                               |    5 -
+ target/hppa/cpu.c                   |   1 +
- tcg/riscv/tcg-target-reg-bits.h                    |   19 +
+ target/hppa/translate.c             |   4 +-
- tcg/riscv/tcg-target.h                             |    9 -
+ target/i386/tcg/tcg-cpu.c           |   1 +
- tcg/s390x/tcg-target-reg-bits.h                    |   17 +
+ target/i386/tcg/translate.c         |   5 +-
- tcg/sparc64/tcg-target-reg-bits.h                  |   12 +
+ target/loongarch/cpu.c              |   1 +
- tcg/tcg-internal.h                                 |   47 +-
+ target/loongarch/tcg/translate.c    |   4 +-
- tcg/tci/tcg-target-reg-bits.h                      |   18 +
+ target/m68k/cpu.c                   |   1 +
- tcg/tci/tcg-target.h                               |    8 -
+ target/m68k/translate.c             |   4 +-
- include/exec/helper-gen.h.inc                      |  102 ++
+ target/microblaze/cpu.c             |   1 +
- include/exec/helper-proto.h.inc                    |   68 ++
+ target/microblaze/translate.c       |   4 +-
- accel/tcg/cpu-exec.c                               |    2 +-
+ target/mips/cpu.c                   |   1 +
- accel/tcg/cputlb.c                                 |   12 +-
+ target/mips/tcg/translate.c         |   4 +-
- accel/tcg/monitor.c                                |    1 +
+ target/openrisc/cpu.c               |   1 +
- accel/tcg/perf.c                                   |   19 +-
+ target/openrisc/translate.c         |   4 +-
- accel/tcg/plugin-gen.c                             |    6 +
+ target/ppc/cpu_init.c               |   1 +
- accel/tcg/tcg-accel-ops-mttcg.c                    |    2 +-
+ target/ppc/translate.c              |   4 +-
- accel/tcg/tcg-accel-ops-rr.c                       |    2 +-
+ target/riscv/tcg/tcg-cpu.c          |   1 +
- accel/tcg/tcg-all.c                                |    1 +
+ target/riscv/translate.c            |   4 +-
- accel/tcg/tcg-runtime-gvec.c                       |    2 +-
+ target/rx/cpu.c                     |   1 +
- accel/tcg/tcg-runtime.c                            |    6 +-
+ target/rx/translate.c               |   4 +-
- accel/tcg/translate-all.c                          |   30 +-
+ target/s390x/cpu.c                  |   1 +
- accel/tcg/translator.c                             |  140 ++-
+ target/s390x/tcg/translate.c        |   4 +-
- target/alpha/translate.c                           |   18 +-
+ target/sh4/cpu.c                    |   1 +
- target/arm/ptw.c                                   |    8 +-
+ target/sh4/translate.c              |   4 +-
- target/arm/tcg/translate-a64.c                     |   42 +-
+ target/sparc/cpu.c                  |   1 +
- target/arm/tcg/translate-m-nocp.c                  |    2 -
+ target/sparc/fop_helper.c           |   8 +-
- target/arm/tcg/translate-mve.c                     |    4 -
+ target/sparc/translate.c            |  84 ++--
- target/arm/tcg/translate-neon.c                    |    4 -
+ target/tricore/cpu.c                |   1 +
- target/arm/tcg/translate-sme.c                     |    7 -
+ target/tricore/translate.c          |   5 +-
- target/arm/tcg/translate-sve.c                     |   11 -
+ target/xtensa/cpu.c                 |   1 +
- target/arm/tcg/translate-vfp.c                     |    7 +-
+ target/xtensa/translate.c           |   4 +-
- target/arm/tcg/translate.c                         |   41 +-
+ tcg/optimize.c                      | 857 +++++++++++++++++++-----------------
- target/avr/cpu.c                                   |    1 +
+ tests/tcg/multiarch/system/memory.c |   9 +-
- target/avr/helper.c                                |    1 +
+ fpu/softfloat-parts.c.inc           |  16 +-
- target/avr/translate.c                             |    6 +-
+files changed, 866 insertions(+), 1009 deletions(-)
  target/cris/translate.c                            |    8 +-
  target/hexagon/genptr.c                            |    1 +
  target/hexagon/translate.c                         |    7 +
  target/hppa/translate.c                            |   10 +-
  target/i386/helper.c                               |    3 +
  target/i386/tcg/translate.c                        |   57 +-
  target/loongarch/translate.c                       |    7 +-
  target/m68k/translate.c                            |    5 +-
  target/microblaze/translate.c                      |    6 +-
  target/mips/tcg/msa_translate.c                    |    3 -
  target/mips/tcg/mxu_translate.c                    |    2 -
  target/mips/tcg/octeon_translate.c                 |    4 +-
  target/mips/tcg/rel6_translate.c                   |    2 -
  target/mips/tcg/translate.c                        |   53 +-
  target/mips/tcg/translate_addr_const.c             |    1 -
  target/mips/tcg/tx79_translate.c                   |    4 +-
  target/mips/tcg/vr54xx_translate.c                 |    3 -
  target/nios2/translate.c                           |    6 +-
  target/openrisc/sys_helper.c                       |    1 +
  target/openrisc/translate.c                        |   14 +-
  target/ppc/translate.c                             |   78 +-
  target/riscv/cpu_helper.c                          |    1 +
  target/riscv/translate.c                           |    6 +-
  target/rx/cpu.c                                    |    1 +
  target/rx/op_helper.c                              |    1 +
  target/rx/translate.c                              |    7 +-
  target/s390x/tcg/translate.c                       |   10 +-
  target/sh4/translate.c                             |   21 +-
  target/sparc/translate.c                           |   78 +-
  target/tricore/cpu.c                               |    1 +
  target/tricore/translate.c                         |    7 +-
  target/xtensa/translate.c                          |   31 +-
  tcg/optimize.c                                     |    2 +-
  tcg/region.c                                       |   20 +-
  tcg/tcg-op-gvec.c                                  |    4 +-
  tcg/tcg-op-ldst.c                                  |   26 +-
  tcg/tcg-op-vec.c                                   |   13 +-
  tcg/tcg-op.c                                       |    4 +-
  tcg/tcg.c                                          |  218 +++--
  tcg/tci.c                                          |    3 +-
  include/exec/helper-info.c.inc                     |   96 ++
  target/loongarch/insn_trans/trans_extra.c.inc      |    4 +-
  target/loongarch/insn_trans/trans_privileged.c.inc |    4 +-
  target/ppc/power8-pmu-regs.c.inc                   |   10 +-
  target/ppc/translate/branch-impl.c.inc             |    2 +-
  target/riscv/insn_trans/trans_privileged.c.inc     |    8 +-
  target/riscv/insn_trans/trans_rvi.c.inc            |   24 +-
  tcg/aarch64/tcg-target.c.inc                       |    8 +-
  tcg/arm/tcg-target.c.inc                           |    8 +-
  tcg/i386/tcg-target.c.inc                          |    9 +-
  tcg/loongarch64/tcg-target.c.inc                   |    8 +-
  tcg/mips/tcg-target.c.inc                          |   20 +-
  tcg/ppc/tcg-target.c.inc                           |   46 +-
  tcg/riscv/tcg-target.c.inc                         |   21 +-
  tcg/s390x/tcg-target.c.inc                         |   22 +-
  tcg/sparc64/tcg-target.c.inc                       |   20 +-
  scripts/make-config-poison.sh                      |    5 +-
  target/hexagon/idef-parser/idef-parser.y           |    3 +-
  tcg/meson.build                                    |   30 +-
 files changed, 3088 insertions(+), 2782 deletions(-)
  delete mode 100644 include/exec/gen-icount.h
  create mode 100644 include/exec/helper-gen-common.h
  create mode 100644 include/exec/helper-proto-common.h
  delete mode 100644 include/exec/helper-tcg.h
  create mode 100644 include/exec/tlb-common.h
  create mode 100644 include/exec/translation-block.h
  create mode 100644 include/tcg/helper-info.h
  create mode 100644 include/tcg/insn-start-words.h
  create mode 100644 include/tcg/oversized-guest.h
  create mode 100644 include/tcg/tcg-op-common.h
  create mode 100644 include/tcg/tcg-op-gvec-common.h
  create mode 100644 tcg/aarch64/tcg-target-reg-bits.h
  create mode 100644 tcg/arm/tcg-target-reg-bits.h
  create mode 100644 tcg/i386/tcg-target-reg-bits.h
  create mode 100644 tcg/loongarch64/tcg-target-reg-bits.h
  create mode 100644 tcg/mips/tcg-target-reg-bits.h
  create mode 100644 tcg/ppc/tcg-target-reg-bits.h
  create mode 100644 tcg/riscv/tcg-target-reg-bits.h
  create mode 100644 tcg/s390x/tcg-target-reg-bits.h
  create mode 100644 tcg/sparc64/tcg-target-reg-bits.h
  create mode 100644 tcg/tci/tcg-target-reg-bits.h
  create mode 100644 include/exec/helper-gen.h.inc
  create mode 100644 include/exec/helper-proto.h.inc
  create mode 100644 include/exec/helper-info.c.inc

-New patch
+[PULL 01/72] tests/tcg: Do not use inttypes.h in multiarch/system/memory.c
+From: Ilya Leoshkevich <iii@linux.ibm.com>
+make check-tcg fails on Fedora with the following error message:
+    alpha-linux-gnu-gcc [...] qemu/tests/tcg/multiarch/system/memory.c -o memory [...]
+    qemu/tests/tcg/multiarch/system/memory.c:17:10: fatal error: inttypes.h: No such file or directory
+       17 | #include <inttypes.h>
+          |          ^~~~~~~~~~~~
+    compilation terminated.
+The reason is that Fedora has cross-compilers, but no cross-glibc
+headers. Fix by hardcoding the format specifiers and dropping the
+include.
+An alternative fix would be to introduce a configure check for
+inttypes.h. But this would make it impossible to use Fedora
+cross-compilers for softmmu tests, which used to work so far.
+Fixes: ecbcc9ead2f8 ("tests/tcg: add a system test to check memory instrumentation")
+Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
+Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
+Message-ID: <20241010085906.226249-1-iii@linux.ibm.com>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tests/tcg/multiarch/system/memory.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+diff --git a/tests/tcg/multiarch/system/memory.c b/tests/tcg/multiarch/system/memory.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tests/tcg/multiarch/system/memory.c
++++ b/tests/tcg/multiarch/system/memory.c
+@@ -XXX,XX +XXX,XX @@
+ #include <stdint.h>
+ #include <stdbool.h>
+-#include <inttypes.h>
+ #include <minilib.h>
+ #ifndef CHECK_UNALIGNED
+@@ -XXX,XX +XXX,XX @@ int main(void)
+     int i;
+     bool ok = true;
+-    ml_printf("Test data start: 0x%"PRIxPTR"\n", &test_data[0]);
+-    ml_printf("Test data end: 0x%"PRIxPTR"\n", &test_data[TEST_SIZE]);
++    ml_printf("Test data start: 0x%lx\n", (unsigned long)&test_data[0]);
++    ml_printf("Test data end: 0x%lx\n", (unsigned long)&test_data[TEST_SIZE]);
+     /* Run through the unsigned tests first */
+     for (i = 0; i < ARRAY_SIZE(init_ufns) && ok; i++) {
+@@ -XXX,XX +XXX,XX @@ int main(void)
+         ok = do_signed_reads(true);
+     }
+-    ml_printf("Test data read: %"PRId32"\n", test_read_count);
+-    ml_printf("Test data write: %"PRId32"\n", test_write_count);
++    ml_printf("Test data read: %lu\n", (unsigned long)test_read_count);
++    ml_printf("Test data write: %lu\n", (unsigned long)test_write_count);
+     ml_printf("Test complete: %s\n", ok ? "PASSED" : "FAILED");
+     return ok ? 0 : -1;
+ }
+--
+2.43.0

-[PULL 46/52] tcg: Remove target-specific headers from tcg.[ch]
+[PULL 02/72] plugins: optimize cpu_index code generation
-This finally paves the way for tcg/ to be built once per mode.
+From: Pierrick Bouvier <pierrick.bouvier@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+When running with a single vcpu, we can return a constant instead of a
 load when accessing cpu_index.
 A side effect is that all tcg operations using it are optimized, most
 notably scoreboard access.
 When running a simple loop in user-mode, the speedup is around 20%.
 Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Message-ID: <20241128213843.1023080-1-pierrick.bouvier@linaro.org>
 ---
- include/tcg/tcg.h      | 1 -
+ accel/tcg/plugin-gen.c | 9 +++++++++
- accel/tcg/plugin-gen.c | 1 +
+ 1 file changed, 9 insertions(+)
  tcg/region.c           | 2 +-
  tcg/tcg-op.c           | 2 +-
  tcg/tcg.c              | 2 +-
 files changed, 4 insertions(+), 4 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
-+++ b/include/tcg/tcg.h
-@@ -XXX,XX +XXX,XX @@
- #ifndef TCG_H
- #define TCG_H
--#include "cpu.h"
- #include "exec/memop.h"
- #include "exec/memopidx.h"
- #include "qemu/bitops.h"
 diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/plugin-gen.c
 +++ b/accel/tcg/plugin-gen.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static void gen_disable_mem_helper(void)
-  * CPU's index into a TCG temp, since the first callback did it already.
-  */
+ static TCGv_i32 gen_cpu_index(void)
- #include "qemu/osdep.h"
+ {
-+#include "cpu.h"
++    /*
- #include "tcg/tcg.h"
++     * Optimize when we run with a single vcpu. All values using cpu_index,
- #include "tcg/tcg-temp-internal.h"
++     * including scoreboard index, will be optimized out.
- #include "tcg/tcg-op.h"
++     * User-mode calls tb_flush when setting this flag. In system-mode, all
-diff --git a/tcg/region.c b/tcg/region.c
++     * vcpus are created before generating code.
-index XXXXXXX..XXXXXXX 100644
++     */
---- a/tcg/region.c
++    if (!tcg_cflags_has(current_cpu, CF_PARALLEL)) {
-+++ b/tcg/region.c
++        return tcg_constant_i32(current_cpu->cpu_index);
-@@ -XXX,XX +XXX,XX @@
++    }
- #include "qemu/cacheinfo.h"
+     TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
- #include "qemu/qtree.h"
+     tcg_gen_ld_i32(cpu_index, tcg_env,
- #include "qapi/error.h"
+                    -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
 -#include "exec/exec-all.h"
  #include "tcg/tcg.h"
 +#include "exec/translation-block.h"
  #include "tcg-internal.h"
 diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op.c
 +++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "exec/exec-all.h"
  #include "tcg/tcg.h"
  #include "tcg/tcg-temp-internal.h"
  #include "tcg/tcg-op-common.h"
 +#include "exec/translation-block.h"
  #include "exec/plugin-gen.h"
  #include "tcg-internal.h"
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/cacheflush.h"
  #include "qemu/cacheinfo.h"
  #include "qemu/timer.h"
 -#include "exec/exec-all.h"
 +#include "exec/translation-block.h"
  #include "exec/tlb-common.h"
  #include "tcg/tcg-op-common.h"
 --
-2.34.1
+2.43.0

-New patch
+[PULL 03/72] tcg/optimize: Split out finish_bb, finish_ebb
+Call them directly from the opcode switch statement in tcg_optimize,
+rather than in finish_folding based on opcode flags.  Adjust folding
+of conditional branches to match.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 47 +++++++++++++++++++++++++++++++----------------
+ 1 file changed, 31 insertions(+), 16 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
+     }
+ }
++static void finish_bb(OptContext *ctx)
++{
++    /* We only optimize memory barriers across basic blocks. */
++    ctx->prev_mb = NULL;
++}
++
++static void finish_ebb(OptContext *ctx)
++{
++    finish_bb(ctx);
++    /* We only optimize across extended basic blocks. */
++    memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
++    remove_mem_copy_all(ctx);
++}
++
+ static void finish_folding(OptContext *ctx, TCGOp *op)
+ {
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
+     int i, nb_oargs;
+-    /*
+-     * We only optimize extended basic blocks.  If the opcode ends a BB
+-     * and is not a conditional branch, reset all temp data.
+-     */
+-    if (def->flags & TCG_OPF_BB_END) {
+-        ctx->prev_mb = NULL;
+-        if (!(def->flags & TCG_OPF_COND_BRANCH)) {
+-            memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+-            remove_mem_copy_all(ctx);
+-        }
+-        return;
+-    }
+-
+     nb_oargs = def->nb_oargs;
+     for (i = 0; i < nb_oargs; i++) {
+         TCGTemp *ts = arg_temp(op->args[i]);
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
+     if (i > 0) {
+         op->opc = INDEX_op_br;
+         op->args[0] = op->args[3];
++        finish_ebb(ctx);
++    } else {
++        finish_bb(ctx);
+     }
+-    return false;
++    return true;
+ }
+ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+         }
+         op->opc = INDEX_op_br;
+         op->args[0] = label;
+-        break;
++        finish_ebb(ctx);
++        return true;
+     }
+-    return false;
++
++    finish_bb(ctx);
++    return true;
+ }
+ static bool fold_bswap(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
+             break;
++        case INDEX_op_set_label:
++        case INDEX_op_br:
++        case INDEX_op_exit_tb:
++        case INDEX_op_goto_tb:
++        case INDEX_op_goto_ptr:
++            finish_ebb(&ctx);
++            done = true;
++            break;
+         default:
+             break;
+         }
+--
+2.43.0

-[PULL 24/52] tcg: Split helper-proto.h
+[PULL 04/72] tcg/optimize: Split out fold_affected_mask
-Create helper-proto-common.h without the target specific portion.
+There are only a few logical operations which can compute
-Use that in tcg-op-common.h.  Include helper-proto.h in target/arm
+an "affected" mask.  Split out handling of this optimization
-and target/hexagon before helper-info.c.inc; all other targets are
+to a separate function, only to be called when applicable.
 already correct in this regard.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Remove the a_mask field from OptContext, as the mask is
 no longer stored anywhere.
 Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/helper-proto-common.h | 18 ++++++++
+ tcg/optimize.c | 42 +++++++++++++++++++++++++++---------------
- include/exec/helper-proto.h        | 73 ++++--------------------------
+ 1 file changed, 27 insertions(+), 15 deletions(-)
  include/tcg/tcg-op-common.h        |  2 +-
  include/exec/helper-proto.h.inc    | 68 ++++++++++++++++++++++++++++
  accel/tcg/cputlb.c                 |  3 +-
  accel/tcg/plugin-gen.c             |  2 +-
  accel/tcg/tcg-runtime-gvec.c       |  2 +-
  accel/tcg/tcg-runtime.c            |  2 +-
  target/arm/tcg/translate.c         |  1 +
  target/hexagon/translate.c         |  1 +
 files changed, 102 insertions(+), 70 deletions(-)
  create mode 100644 include/exec/helper-proto-common.h
  create mode 100644 include/exec/helper-proto.h.inc
-diff --git a/include/exec/helper-proto-common.h b/include/exec/helper-proto-common.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/exec/helper-proto-common.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Helper file for declaring TCG helper functions.
 + * This one expands prototypes for the helper functions.
 + */
 +
 +#ifndef HELPER_PROTO_COMMON_H
 +#define HELPER_PROTO_COMMON_H
 +
 +#define HELPER_H "accel/tcg/tcg-runtime.h"
 +#include "exec/helper-proto.h.inc"
 +#undef  HELPER_H
 +
 +#define HELPER_H "accel/tcg/plugin-helpers.h"
 +#include "exec/helper-proto.h.inc"
 +#undef  HELPER_H
 +
 +#endif /* HELPER_PROTO_COMMON_H */
 diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/helper-proto.h
+--- a/tcg/optimize.c
-+++ b/include/exec/helper-proto.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
--/* Helper file for declaring TCG helper functions.
+     QSIMPLEQ_HEAD(, MemCopyInfo) mem_free;
--   This one expands prototypes for the helper functions.  */
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
+     /* In flight values from optimization. */
-+/*
+-    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
-+ * Helper file for declaring TCG helper functions.
+     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
-+ * This one expands prototypes for the helper functions.
+     uint64_t s_mask;  /* mask of clrsb(value) bits */
-+ */
+     TCGType type;
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
- #ifndef HELPER_PROTO_H
- #define HELPER_PROTO_H
+ static bool fold_masks(OptContext *ctx, TCGOp *op)
+ {
--#include "exec/helper-head.h"
+-    uint64_t a_mask = ctx->a_mask;
-+#include "exec/helper-proto-common.h"
+     uint64_t z_mask = ctx->z_mask;
+     uint64_t s_mask = ctx->s_mask;
--/*
-- * Work around an issue with --enable-lto, in which GCC's ipa-split pass
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
-- * decides to split out the noreturn code paths that raise an exception,
+      * type changing opcodes.
-- * taking the __builtin_return_address() along into the new function,
+      */
-- * where it no longer computes a value that returns to TCG generated code.
+     if (ctx->type == TCG_TYPE_I32) {
-- * Despite the name, the noinline attribute affects splitter, so this
+-        a_mask = (int32_t)a_mask;
-- * prevents the optimization in question.  Given that helpers should not
+         z_mask = (int32_t)z_mask;
-- * otherwise be called directly, this should have any other visible effect.
+         s_mask |= MAKE_64BIT_MASK(32, 32);
-- *
+         ctx->z_mask = z_mask;
-- * See https://gitlab.com/qemu-project/qemu/-/issues/1454
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
-- */
+     if (z_mask == 0) {
--#define DEF_HELPER_ATTR  __attribute__((noinline))
+         return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
--
+     }
--#define DEF_HELPER_FLAGS_0(name, flags, ret) \
++    return false;
--dh_ctype(ret) HELPER(name) (void) DEF_HELPER_ATTR;
++}
 -
 -#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \
 -dh_ctype(ret) HELPER(name) (dh_ctype(t1)) DEF_HELPER_ATTR;
 -
 -#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
 -dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)) DEF_HELPER_ATTR;
 -
 -#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \
 -dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), \
 -                            dh_ctype(t3)) DEF_HELPER_ATTR;
 -
 -#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \
 -dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 -                            dh_ctype(t4)) DEF_HELPER_ATTR;
 -
 -#define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5) \
 -dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 -                            dh_ctype(t4), dh_ctype(t5)) DEF_HELPER_ATTR;
 -
 -#define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6) \
 -dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 -                            dh_ctype(t4), dh_ctype(t5), \
 -                            dh_ctype(t6)) DEF_HELPER_ATTR;
 -
 -#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
 -dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
 -                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
 -                            dh_ctype(t7)) DEF_HELPER_ATTR;
 -
 -#define IN_HELPER_PROTO
 -
 -#include "helper.h"
 -#include "accel/tcg/tcg-runtime.h"
 -#include "accel/tcg/plugin-helpers.h"
 -
 -#undef IN_HELPER_PROTO
 -
 -#undef DEF_HELPER_FLAGS_0
 -#undef DEF_HELPER_FLAGS_1
 -#undef DEF_HELPER_FLAGS_2
 -#undef DEF_HELPER_FLAGS_3
 -#undef DEF_HELPER_FLAGS_4
 -#undef DEF_HELPER_FLAGS_5
 -#undef DEF_HELPER_FLAGS_6
 -#undef DEF_HELPER_FLAGS_7
 -#undef DEF_HELPER_ATTR
 +#define HELPER_H "helper.h"
 +#include "exec/helper-proto.h.inc"
 +#undef  HELPER_H
  #endif /* HELPER_PROTO_H */
 diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg-op-common.h
 +++ b/include/tcg/tcg-op-common.h
@@ -XXX,XX +XXX,XX @@
  #define TCG_TCG_OP_COMMON_H
  #include "tcg/tcg.h"
 -#include "exec/helper-proto.h"
 +#include "exec/helper-proto-common.h"
  #include "exec/helper-gen-common.h"
  /* Basic output routines.  Not for general consumption.  */
 diff --git a/include/exec/helper-proto.h.inc b/include/exec/helper-proto.h.inc
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/exec/helper-proto.h.inc
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Helper file for declaring TCG helper functions.
 + * This one expands prototypes for the helper functions.
 + * Define HELPER_H for the header file to be expanded.
 + */
 +
 +#include "exec/helper-head.h"
 +
 +/*
-+ * Work around an issue with --enable-lto, in which GCC's ipa-split pass
++ * An "affected" mask bit is 0 if and only if the result is identical
-+ * decides to split out the noreturn code paths that raise an exception,
++ * to the first input.  Thus if the entire mask is 0, the operation
-+ * taking the __builtin_return_address() along into the new function,
++ * is equivalent to a copy.
 + * where it no longer computes a value that returns to TCG generated code.
 + * Despite the name, the noinline attribute affects splitter, so this
 + * prevents the optimization in question.  Given that helpers should not
 + * otherwise be called directly, this should not have any other visible effect.
 + *
 + * See https://gitlab.com/qemu-project/qemu/-/issues/1454
 + */
-+#define DEF_HELPER_ATTR  __attribute__((noinline))
++static bool fold_affected_mask(OptContext *ctx, TCGOp *op, uint64_t a_mask)
-+
++{
-+#define DEF_HELPER_FLAGS_0(name, flags, ret) \
++    if (ctx->type == TCG_TYPE_I32) {
-+dh_ctype(ret) HELPER(name) (void) DEF_HELPER_ATTR;
++        a_mask = (uint32_t)a_mask;
-+
++    }
-+#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \
+     if (a_mask == 0) {
-+dh_ctype(ret) HELPER(name) (dh_ctype(t1)) DEF_HELPER_ATTR;
+         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-+
+     }
-+#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)) DEF_HELPER_ATTR;
+      * Known-zeros does not imply known-ones.  Therefore unless
-+
+      * arg2 is constant, we can't infer affected bits from it.
-+#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \
+      */
-+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), \
+-    if (arg_is_const(op->args[2])) {
-+                            dh_ctype(t3)) DEF_HELPER_ATTR;
+-        ctx->a_mask = z1 & ~z2;
-+
++    if (arg_is_const(op->args[2]) &&
-+#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \
++        fold_affected_mask(ctx, op, z1 & ~z2)) {
-+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
++        return true;
-+                            dh_ctype(t4)) DEF_HELPER_ATTR;
+     }
-+
-+#define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5) \
+     return fold_masks(ctx, op);
-+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
-+                            dh_ctype(t4), dh_ctype(t5)) DEF_HELPER_ATTR;
+      */
-+
+     if (arg_is_const(op->args[2])) {
-+#define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6) \
+         uint64_t z2 = ~arg_info(op->args[2])->z_mask;
-+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+-        ctx->a_mask = z1 & ~z2;
-+                            dh_ctype(t4), dh_ctype(t5), \
++        if (fold_affected_mask(ctx, op, z1 & ~z2)) {
-+                            dh_ctype(t6)) DEF_HELPER_ATTR;
++            return true;
-+
++        }
-+#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
+         z1 &= z2;
-+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+     }
-+                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
+     ctx->z_mask = z1;
-+                            dh_ctype(t7)) DEF_HELPER_ATTR;
+@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
-+
-+#define IN_HELPER_PROTO
+     z_mask_old = arg_info(op->args[1])->z_mask;
-+
+     z_mask = extract64(z_mask_old, pos, len);
-+#include HELPER_H
+-    if (pos == 0) {
-+
+-        ctx->a_mask = z_mask_old ^ z_mask;
-+#undef IN_HELPER_PROTO
++    if (pos == 0 && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
-+
++        return true;
-+#undef DEF_HELPER_FLAGS_0
+     }
-+#undef DEF_HELPER_FLAGS_1
+     ctx->z_mask = z_mask;
-+#undef DEF_HELPER_FLAGS_2
+     ctx->s_mask = smask_from_zmask(z_mask);
-+#undef DEF_HELPER_FLAGS_3
+@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
-+#undef DEF_HELPER_FLAGS_4
-+#undef DEF_HELPER_FLAGS_5
+     ctx->z_mask = z_mask;
-+#undef DEF_HELPER_FLAGS_6
+     ctx->s_mask = s_mask;
-+#undef DEF_HELPER_FLAGS_7
+-    if (!type_change) {
-+#undef DEF_HELPER_ATTR
+-        ctx->a_mask = s_mask & ~s_mask_old;
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
++    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
-index XXXXXXX..XXXXXXX 100644
++        return true;
---- a/accel/tcg/cputlb.c
+     }
-+++ b/accel/tcg/cputlb.c
-@@ -XXX,XX +XXX,XX @@
+     return fold_masks(ctx, op);
- #include "tcg/tcg.h"
+@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
- #include "qemu/error-report.h"
- #include "exec/log.h"
+     ctx->z_mask = z_mask;
--#include "exec/helper-proto.h"
+     ctx->s_mask = smask_from_zmask(z_mask);
-+#include "exec/helper-proto-common.h"
+-    if (!type_change) {
- #include "qemu/atomic.h"
+-        ctx->a_mask = z_mask_old ^ z_mask;
- #include "qemu/atomic128.h"
++    if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
- #include "exec/translate-all.h"
++        return true;
-@@ -XXX,XX +XXX,XX @@
+     }
- #endif
+     return fold_masks(ctx, op);
- #include "tcg/tcg-ldst.h"
+ }
- #include "tcg/oversized-guest.h"
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
--#include "exec/helper-proto.h"
+     s_mask |= MAKE_64BIT_MASK(len, 64 - len);
+     ctx->s_mask = s_mask;
- /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
- /* #define DEBUG_TLB */
+-    if (pos == 0) {
-diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
+-        ctx->a_mask = s_mask & ~s_mask_old;
-index XXXXXXX..XXXXXXX 100644
++    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
---- a/accel/tcg/plugin-gen.c
++        return true;
-+++ b/accel/tcg/plugin-gen.c
+     }
-@@ -XXX,XX +XXX,XX @@
- #include "exec/exec-all.h"
+     return fold_masks(ctx, op);
- #include "exec/plugin-gen.h"
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- #include "exec/translator.h"
+         }
--#include "exec/helper-proto.h"
-+#include "exec/helper-proto-common.h"
+         /* Assume all bits affected, no bits known zero, no sign reps. */
+-        ctx.a_mask = -1;
- #define HELPER_H  "accel/tcg/plugin-helpers.h"
+         ctx.z_mask = -1;
- #include "exec/helper-info.c.inc"
+         ctx.s_mask = 0;
-diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-runtime-gvec.c
 +++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qemu/host-utils.h"
  #include "cpu.h"
 -#include "exec/helper-proto.h"
 +#include "exec/helper-proto-common.h"
  #include "tcg/tcg-gvec-desc.h"
 diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-runtime.c
 +++ b/accel/tcg/tcg-runtime.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "qemu/host-utils.h"
  #include "cpu.h"
 -#include "exec/helper-proto.h"
 +#include "exec/helper-proto-common.h"
  #include "exec/cpu_ldst.h"
  #include "exec/exec-all.h"
  #include "disas/disas.h"
 diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate.c
 +++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "translate.h"
  #include "translate-a32.h"
  #include "exec/gen-icount.h"
 +#include "exec/helper-proto.h"
  #define HELPER_H "helper.h"
  #include "exec/helper-info.c.inc"
 diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hexagon/translate.c
 +++ b/target/hexagon/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "tcg/tcg-op.h"
  #include "tcg/tcg-op-gvec.h"
  #include "exec/helper-gen.h"
 +#include "exec/helper-proto.h"
  #include "exec/cpu_ldst.h"
  #include "exec/log.h"
  #include "internal.h"
 --
-.34.1
+.43.0

-[PULL 40/52] target/mips: Tidy helpers for translation
+[PULL 05/72] tcg/optimize: Copy mask writeback to fold_masks
-Move most includes from *translate*.c to translate.h, ensuring
+Use of fold_masks should be restricted to those opcodes that
-that we get the ordering correct.  Ensure cpu.h is first.
+can reliably make use of it -- those with a single output,
-Use disas/disas.h instead of exec/log.h.
+and from higher-level folders that set up the masks.
-Drop otherwise unused includes.
+Prepare for conversion of each folder in turn.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/mips/tcg/translate.h            |  6 ++++--
+ tcg/optimize.c | 17 ++++++++++++++---
- target/mips/tcg/msa_translate.c        |  3 ---
+file changed, 14 insertions(+), 3 deletions(-)
  target/mips/tcg/mxu_translate.c        |  2 --
  target/mips/tcg/octeon_translate.c     |  4 +---
  target/mips/tcg/rel6_translate.c       |  2 --
  target/mips/tcg/translate.c            | 18 ++++++------------
  target/mips/tcg/translate_addr_const.c |  1 -
  target/mips/tcg/tx79_translate.c       |  4 +---
  target/mips/tcg/vr54xx_translate.c     |  3 ---
 files changed, 12 insertions(+), 31 deletions(-)
-diff --git a/target/mips/tcg/translate.h b/target/mips/tcg/translate.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/translate.h
+--- a/tcg/optimize.c
-+++ b/target/mips/tcg/translate.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
- #ifndef TARGET_MIPS_TRANSLATE_H
+ {
- #define TARGET_MIPS_TRANSLATE_H
+     uint64_t z_mask = ctx->z_mask;
+     uint64_t s_mask = ctx->s_mask;
--#include "qemu/log.h"
++    const TCGOpDef *def = &tcg_op_defs[op->opc];
--#include "exec/translator.h"
++    TCGTemp *ts;
-+#include "cpu.h"
++    TempOptInfo *ti;
- #include "tcg/tcg-op.h"
++
-+#include "exec/translator.h"
++    /* Only single-output opcodes are supported here. */
-+#include "exec/helper-gen.h"
++    tcg_debug_assert(def->nb_oargs == 1);
-+#include "qemu/log.h"
+     /*
- #define MIPS_DEBUG_DISAS 0
+      * 32-bit ops generate 32-bit results, which for the purpose of
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
-diff --git a/target/mips/tcg/msa_translate.c b/target/mips/tcg/msa_translate.c
+     if (ctx->type == TCG_TYPE_I32) {
-index XXXXXXX..XXXXXXX 100644
+         z_mask = (int32_t)z_mask;
---- a/target/mips/tcg/msa_translate.c
+         s_mask |= MAKE_64BIT_MASK(32, 32);
-+++ b/target/mips/tcg/msa_translate.c
+-        ctx->z_mask = z_mask;
-@@ -XXX,XX +XXX,XX @@
+-        ctx->s_mask = s_mask;
-  * SPDX-License-Identifier: LGPL-2.1-or-later
+     }
-  */
- #include "qemu/osdep.h"
+     if (z_mask == 0) {
--#include "tcg/tcg-op.h"
+         return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
--#include "exec/helper-gen.h"
+     }
- #include "translate.h"
+-    return false;
- #include "fpu_helper.h"
++
--#include "internal.h"
++    ts = arg_temp(op->args[0]);
++    reset_ts(ctx, ts);
- static int elm_n(DisasContext *ctx, int x);
++
- static int elm_df(DisasContext *ctx, int x);
++    ti = ts_info(ts);
-diff --git a/target/mips/tcg/mxu_translate.c b/target/mips/tcg/mxu_translate.c
++    ti->z_mask = z_mask;
-index XXXXXXX..XXXXXXX 100644
++    ti->s_mask = s_mask;
---- a/target/mips/tcg/mxu_translate.c
++    return true;
-+++ b/target/mips/tcg/mxu_translate.c
+ }
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "tcg/tcg-op.h"
 -#include "exec/helper-gen.h"
  #include "translate.h"
  /*
-diff --git a/target/mips/tcg/octeon_translate.c b/target/mips/tcg/octeon_translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/octeon_translate.c
-+++ b/target/mips/tcg/octeon_translate.c
-@@ -XXX,XX +XXX,XX @@
-  */
- #include "qemu/osdep.h"
--#include "tcg/tcg-op.h"
--#include "tcg/tcg-op-gvec.h"
--#include "exec/helper-gen.h"
- #include "translate.h"
-+#include "tcg/tcg-op-gvec.h"
- /* Include the auto-generated decoder.  */
- #include "decode-octeon.c.inc"
-diff --git a/target/mips/tcg/rel6_translate.c b/target/mips/tcg/rel6_translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/rel6_translate.c
-+++ b/target/mips/tcg/rel6_translate.c
-@@ -XXX,XX +XXX,XX @@
-  */
- #include "qemu/osdep.h"
--#include "tcg/tcg-op.h"
--#include "exec/helper-gen.h"
- #include "translate.h"
- /* Include the auto-generated decoders.  */
-diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/translate.c
-+++ b/target/mips/tcg/translate.c
-@@ -XXX,XX +XXX,XX @@
-  */
- #include "qemu/osdep.h"
--#include "cpu.h"
--#include "internal.h"
--#include "tcg/tcg-op.h"
--#include "exec/translator.h"
--#include "exec/helper-proto.h"
--#include "exec/helper-gen.h"
--#include "semihosting/semihost.h"
--
--#include "trace.h"
--#include "exec/log.h"
--#include "qemu/qemu-print.h"
--#include "fpu_helper.h"
- #include "translate.h"
-+#include "internal.h"
-+#include "exec/helper-proto.h"
-+#include "semihosting/semihost.h"
-+#include "trace.h"
-+#include "disas/disas.h"
-+#include "fpu_helper.h"
- #define HELPER_H "helper.h"
- #include "exec/helper-info.c.inc"
-diff --git a/target/mips/tcg/translate_addr_const.c b/target/mips/tcg/translate_addr_const.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/translate_addr_const.c
-+++ b/target/mips/tcg/translate_addr_const.c
-@@ -XXX,XX +XXX,XX @@
-  * SPDX-License-Identifier: LGPL-2.1-or-later
-  */
- #include "qemu/osdep.h"
--#include "tcg/tcg-op.h"
- #include "translate.h"
- bool gen_lsa(DisasContext *ctx, int rd, int rt, int rs, int sa)
-diff --git a/target/mips/tcg/tx79_translate.c b/target/mips/tcg/tx79_translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/tx79_translate.c
-+++ b/target/mips/tcg/tx79_translate.c
-@@ -XXX,XX +XXX,XX @@
-  */
- #include "qemu/osdep.h"
--#include "tcg/tcg-op.h"
--#include "tcg/tcg-op-gvec.h"
--#include "exec/helper-gen.h"
- #include "translate.h"
-+#include "tcg/tcg-op-gvec.h"
- /* Include the auto-generated decoder.  */
- #include "decode-tx79.c.inc"
-diff --git a/target/mips/tcg/vr54xx_translate.c b/target/mips/tcg/vr54xx_translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/vr54xx_translate.c
-+++ b/target/mips/tcg/vr54xx_translate.c
-@@ -XXX,XX +XXX,XX @@
-  */
- #include "qemu/osdep.h"
--#include "tcg/tcg-op.h"
--#include "exec/helper-gen.h"
- #include "translate.h"
--#include "internal.h"
- /* Include the auto-generated decoder. */
- #include "decode-vr54xx.c.inc"
 --
-.34.1
+.43.0

-[PULL 25/52] target/sh4: Emit insn_start for each insn in gUSA region
+[PULL 06/72] tcg/optimize: Split out fold_masks_zs
-Fixes an assert in tcg_gen_code that we don't accidentally
+Add a routine to which masks can be passed directly, rather than
-eliminate an insn_start during optimization.
+storing them into OptContext.  To be used in upcoming patches.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/sh4/translate.c | 15 ++++++++++++---
+ tcg/optimize.c | 15 ++++++++++++---
 file changed, 12 insertions(+), 3 deletions(-)
-diff --git a/target/sh4/translate.c b/target/sh4/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/sh4/translate.c
+--- a/tcg/optimize.c
-+++ b/target/sh4/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
-     /* The entire region has been translated.  */
+ }
-     ctx->envflags &= ~TB_FLAG_GUSA_MASK;
--    ctx->base.pc_next = pc_end;
+-static bool fold_masks(OptContext *ctx, TCGOp *op)
--    ctx->base.num_insns += max_insns - 1;
++/*
--    return;
++ * Record "zero" and "sign" masks for the single output of @op.
-+    goto done;
++ * See TempOptInfo definition of z_mask and s_mask.
++ * If z_mask allows, fold the output to constant zero.
-  fail:
++ */
-     qemu_log_mask(LOG_UNIMP, "Unrecognized gUSA sequence %08x-%08x\n",
++static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
-@@ -XXX,XX +XXX,XX @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)
++                          uint64_t z_mask, uint64_t s_mask)
-        purposes of accounting within the TB.  We might as well report the
+ {
-        entire region consumed via ctx->base.pc_next so that it's immediately
+-    uint64_t z_mask = ctx->z_mask;
-        available in the disassembly dump.  */
+-    uint64_t s_mask = ctx->s_mask;
      const TCGOpDef *def = &tcg_op_defs[op->opc];
      TCGTemp *ts;
      TempOptInfo *ti;
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      return true;
  }
 +static bool fold_masks(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
 +}
 +
-+ done:
+ /*
-     ctx->base.pc_next = pc_end;
+  * An "affected" mask bit is 0 if and only if the result is identical
-     ctx->base.num_insns += max_insns - 1;
+  * to the first input.  Thus if the entire mask is 0, the operation
 +
 +    /*
 +     * Emit insn_start to cover each of the insns in the region.
 +     * This matches an assert in tcg.c making sure that we have
 +     * tb->icount * insn_start.
 +     */
 +    for (i = 1; i < max_insns; ++i) {
 +        tcg_gen_insn_start(pc + i * 2, ctx->envflags);
 +    }
  }
  #endif
 --
-.34.1
+.43.0

-[PULL 07/52] tcg: Add tlb_fast_offset to TCGContext
+[PULL 07/72] tcg/optimize: Augment s_mask from z_mask in fold_masks_zs
-Disconnect the layout of ArchCPU from TCG compilation.
+Consider the passed s_mask to be a minimum deduced from
-Pass the relative offset of 'env' and 'neg.tlb.f' as a parameter.
+either existing s_mask or from a sign-extension operation.
 We may be able to deduce more from the set of known zeros.
 Remove identical logic from several opcode folders.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-defs.h          | 39 +---------------------
+ tcg/optimize.c | 21 ++++++---------------
- include/exec/tlb-common.h        | 56 ++++++++++++++++++++++++++++++++
+file changed, 6 insertions(+), 15 deletions(-)
  include/tcg/tcg.h                |  1 +
  accel/tcg/translate-all.c        |  2 ++
  tcg/tcg.c                        | 13 ++++++++
  tcg/aarch64/tcg-target.c.inc     |  7 ++--
  tcg/arm/tcg-target.c.inc         |  7 ++--
  tcg/i386/tcg-target.c.inc        |  9 ++---
  tcg/loongarch64/tcg-target.c.inc |  7 ++--
  tcg/mips/tcg-target.c.inc        |  7 ++--
  tcg/ppc/tcg-target.c.inc         |  7 ++--
  tcg/riscv/tcg-target.c.inc       |  7 ++--
  tcg/s390x/tcg-target.c.inc       |  7 ++--
  tcg/sparc64/tcg-target.c.inc     |  7 ++--
 files changed, 110 insertions(+), 66 deletions(-)
  create mode 100644 include/exec/tlb-common.h
-diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu-defs.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu-defs.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
- #define NB_MMU_MODES 16
+  * Record "zero" and "sign" masks for the single output of @op.
+  * See TempOptInfo definition of z_mask and s_mask.
- #if !defined(CONFIG_USER_ONLY) && defined(CONFIG_TCG)
+  * If z_mask allows, fold the output to constant zero.
-+#include "exec/tlb-common.h"
++ * The passed s_mask may be augmented by z_mask.
  /* use a fully associative victim tlb of 8 entries */
  #define CPU_VTLB_SIZE 8
 -#define CPU_TLB_ENTRY_BITS 5
 -
  #define CPU_TLB_DYN_MIN_BITS 6
  #define CPU_TLB_DYN_DEFAULT_BITS 8
@@ -XXX,XX +XXX,XX @@
  #  endif
  # endif
 -/* Minimalized TLB entry for use by TCG fast path. */
 -typedef union CPUTLBEntry {
 -    struct {
 -        uint64_t addr_read;
 -        uint64_t addr_write;
 -        uint64_t addr_code;
 -        /*
 -         * Addend to virtual address to get host address.  IO accesses
 -         * use the corresponding iotlb value.
 -         */
 -        uintptr_t addend;
 -    };
 -    /*
 -     * Padding to get a power of two size, as well as index
 -     * access to addr_{read,write,code}.
 -     */
 -    uint64_t addr_idx[(1 << CPU_TLB_ENTRY_BITS) / sizeof(uint64_t)];
 -} CPUTLBEntry;
 -
 -QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
 -
  #endif  /* !CONFIG_USER_ONLY && CONFIG_TCG */
  #if !defined(CONFIG_USER_ONLY)
@@ -XXX,XX +XXX,XX @@ typedef struct CPUTLBDesc {
      CPUTLBEntryFull *fulltlb;
  } CPUTLBDesc;
 -/*
 - * Data elements that are per MMU mode, accessed by the fast path.
 - * The structure is aligned to aid loading the pair with one insn.
 - */
 -typedef struct CPUTLBDescFast {
 -    /* Contains (n_entries - 1) << CPU_TLB_ENTRY_BITS */
 -    uintptr_t mask;
 -    /* The array of tlb entries itself. */
 -    CPUTLBEntry *table;
 -} CPUTLBDescFast QEMU_ALIGNED(2 * sizeof(void *));
 -
  /*
   * Data elements that are shared between all MMU modes.
   */
-@@ -XXX,XX +XXX,XX @@ typedef struct CPUTLB {
+ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
-     CPUTLBDescFast f[NB_MMU_MODES];
+                           uint64_t z_mask, uint64_t s_mask)
- } CPUTLB;
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
--/* This will be used by TCG backends to compute offsets.  */
+     ti = ts_info(ts);
--#define TLB_MASK_TABLE_OFS(IDX) \
+     ti->z_mask = z_mask;
--    ((int)offsetof(ArchCPU, neg.tlb.f[IDX]) - (int)offsetof(ArchCPU, env))
+-    ti->s_mask = s_mask;
--
++    ti->s_mask = s_mask | smask_from_zmask(z_mask);
  #else
  typedef struct CPUTLB { } CPUTLB;
 diff --git a/include/exec/tlb-common.h b/include/exec/tlb-common.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/exec/tlb-common.h
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Common definitions for the softmmu tlb
 + *
 + * Copyright (c) 2003 Fabrice Bellard
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 + */
 +#ifndef EXEC_TLB_COMMON_H
 +#define EXEC_TLB_COMMON_H 1
 +
 +#define CPU_TLB_ENTRY_BITS 5
 +
 +/* Minimalized TLB entry for use by TCG fast path. */
 +typedef union CPUTLBEntry {
 +    struct {
 +        uint64_t addr_read;
 +        uint64_t addr_write;
 +        uint64_t addr_code;
 +        /*
 +         * Addend to virtual address to get host address.  IO accesses
 +         * use the corresponding iotlb value.
 +         */
 +        uintptr_t addend;
 +    };
 +    /*
 +     * Padding to get a power of two size, as well as index
 +     * access to addr_{read,write,code}.
 +     */
 +    uint64_t addr_idx[(1 << CPU_TLB_ENTRY_BITS) / sizeof(uint64_t)];
 +} CPUTLBEntry;
 +
 +QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
 +
 +/*
 + * Data elements that are per MMU mode, accessed by the fast path.
 + * The structure is aligned to aid loading the pair with one insn.
 + */
 +typedef struct CPUTLBDescFast {
 +    /* Contains (n_entries - 1) << CPU_TLB_ENTRY_BITS */
 +    uintptr_t mask;
 +    /* The array of tlb entries itself. */
 +    CPUTLBEntry *table;
 +} CPUTLBDescFast QEMU_ALIGNED(2 * sizeof(void *));
 +
 +#endif /* EXEC_TLB_COMMON_H */
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg.h
 +++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
      TCGType addr_type;            /* TCG_TYPE_I32 or TCG_TYPE_I64 */
  #ifdef CONFIG_SOFTMMU
 +    int tlb_fast_offset;
      int page_mask;
      uint8_t page_bits;
      uint8_t tlb_dyn_max_bits;
 diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
      tcg_ctx->page_bits = TARGET_PAGE_BITS;
      tcg_ctx->page_mask = TARGET_PAGE_MASK;
      tcg_ctx->tlb_dyn_max_bits = CPU_TLB_DYN_MAX_BITS;
 +    tcg_ctx->tlb_fast_offset =
 +        (int)offsetof(ArchCPU, neg.tlb.f) - (int)offsetof(ArchCPU, env);
  #endif
   tb_overflow:
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
  #define NO_CPU_IO_DEFS
  #include "exec/exec-all.h"
 +#include "exec/tlb-common.h"
  #include "tcg/tcg-op.h"
  #if UINTPTR_MAX == UINT32_MAX
@@ -XXX,XX +XXX,XX @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
      return (uintptr_t)tcg_splitwx_to_rx(&s->gen_tb->jmp_target_addr[which]);
  }
 +#if defined(CONFIG_SOFTMMU) && !defined(CONFIG_TCG_INTERPRETER)
 +static int tlb_mask_table_ofs(TCGContext *s, int which)
 +{
 +    return s->tlb_fast_offset + which * sizeof(CPUTLBDescFast);
 +}
 +#endif
 +
  /* Signal overflow, starting over with fewer guest insns. */
  static G_NORETURN
  void tcg_raise_tb_overflow(TCGContext *s)
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
      tcg_debug_assert(s->addr_type == TCG_TYPE_I32 ||
                       s->addr_type == TCG_TYPE_I64);
 +
 +#if defined(CONFIG_SOFTMMU) && !defined(CONFIG_TCG_INTERPRETER)
 +    tcg_debug_assert(s->tlb_fast_offset < 0);
 +    tcg_debug_assert(s->tlb_fast_offset >= MIN_TLB_MASK_TABLE_OFS);
 +#endif
  }
  static TCGTemp *tcg_temp_alloc(TCGContext *s)
 diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/aarch64/tcg-target.c.inc
 +++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
      return true;
  }
-+/* We expect to use a 7-bit scaled negative offset from ENV.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
-+#define MIN_TLB_MASK_TABLE_OFS  -512
+     default:
-+
+         g_assert_not_reached();
- /*
+     }
-  * For softmmu, perform the TLB load and compare.
+-    s_mask = smask_from_zmask(z_mask);
-  * For useronly, perform any required alignment tests.
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
++    s_mask = 0;
-                  ? TCG_TYPE_I64 : TCG_TYPE_I32);
+     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+     case TCG_BSWAP_OZ:
-     /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {tmp0,tmp1}. */
+         break;
--    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
+@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
--    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -512);
+     default:
-     QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
+         /* The high bits are undefined: force all bits above the sign to 1. */
-     QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
+         z_mask |= sign << 1;
-     tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0,
+-        s_mask = 0;
--                 TLB_MASK_TABLE_OFS(mem_index), 1, 0);
+         break;
-+                 tlb_mask_table_ofs(s, mem_index), 1, 0);
+     }
+     ctx->z_mask = z_mask;
-     /* Extract the TLB index from the address into X0.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
-     tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
+         g_assert_not_reached();
-diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
+     }
-index XXXXXXX..XXXXXXX 100644
+     ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
---- a/tcg/arm/tcg-target.c.inc
+-    ctx->s_mask = smask_from_zmask(ctx->z_mask);
 +++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
      return true;
  }
 +/* We expect to use a 9-bit sign-magnitude negative offset from ENV.  */
 +#define MIN_TLB_MASK_TABLE_OFS  -256
 +
  static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                             TCGReg addrlo, TCGReg addrhi,
                                             MemOpIdx oi, bool is_ld)
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      int mem_index = get_mmuidx(oi);
      int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read)
                          : offsetof(CPUTLBEntry, addr_write);
 -    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
 +    int fast_off = tlb_mask_table_ofs(s, mem_index);
      unsigned s_mask = (1 << (opc & MO_SIZE)) - 1;
      TCGReg t_addr;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      ldst->addrhi_reg = addrhi;
      /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
      QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
      QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
      tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline int setup_guest_base_seg(void)
  #endif /* setup_guest_base_seg */
  #endif /* !SOFTMMU */
 +#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
 +
  /*
   * For softmmu, perform the TLB load and compare.
   * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      int trexw = 0, hrexw = 0, tlbrexw = 0;
      unsigned mem_index = get_mmuidx(oi);
      unsigned s_mask = (1 << s_bits) - 1;
 +    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
      int tlb_mask;
      ldst = new_ldst_label(s);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                     s->page_bits - CPU_TLB_ENTRY_BITS);
      tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
 -                         TLB_MASK_TABLE_OFS(mem_index) +
 -                         offsetof(CPUTLBDescFast, mask));
 +                         fast_ofs + offsetof(CPUTLBDescFast, mask));
      tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
 -                         TLB_MASK_TABLE_OFS(mem_index) +
 -                         offsetof(CPUTLBDescFast, table));
 +                         fast_ofs + offsetof(CPUTLBDescFast, table));
      /*
       * If the required alignment is at least as large as the access, simply
 diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/loongarch64/tcg-target.c.inc
 +++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool tcg_target_has_memory_bswap(MemOp memop)
      return false;
  }
-+/* We expect to use a 12-bit negative offset from ENV.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-+#define MIN_TLB_MASK_TABLE_OFS  -(1 << 11)
+     default:
-+
+         g_assert_not_reached();
- /*
+     }
-  * For softmmu, perform the TLB load and compare.
+-    ctx->s_mask = smask_from_zmask(ctx->z_mask);
   * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
  #ifdef CONFIG_SOFTMMU
      unsigned s_bits = opc & MO_SIZE;
      int mem_index = get_mmuidx(oi);
 -    int fast_ofs = TLB_MASK_TABLE_OFS(mem_index);
 +    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
      int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask);
      int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      ldst->oi = oi;
      ldst->addrlo_reg = addr_reg;
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 11));
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs);
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs);
 diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.c.inc
 +++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool tcg_target_has_memory_bswap(MemOp memop)
      return false;
  }
-+/* We expect to use a 16-bit negative offset from ENV.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
-+#define MIN_TLB_MASK_TABLE_OFS  -32768
+         return true;
-+
+     }
- /*
+     ctx->z_mask = z_mask;
-  * For softmmu, perform the TLB load and compare.
+-    ctx->s_mask = smask_from_zmask(z_mask);
-  * For useronly, perform any required alignment tests.
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+     return fold_masks(ctx, op);
  #ifdef CONFIG_SOFTMMU
      unsigned s_mask = (1 << s_bits) - 1;
      int mem_index = get_mmuidx(oi);
 -    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
 +    int fast_off = tlb_mask_table_ofs(s, mem_index);
      int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
      int table_off = fast_off + offsetof(CPUTLBDescFast, table);
      int add_off = offsetof(CPUTLBEntry, addend);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      ldst->addrhi_reg = addrhi;
      /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -32768);
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_AREG0, mask_off);
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP1, TCG_AREG0, table_off);
 diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/ppc/tcg-target.c.inc
 +++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool tcg_target_has_memory_bswap(MemOp memop)
      return aa.atom <= MO_64;
  }
+@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
-+/* We expect to use a 16-bit negative offset from ENV.  */
+     }
-+#define MIN_TLB_MASK_TABLE_OFS  -32768
-+
+     ctx->z_mask = z_mask;
- /*
+-    ctx->s_mask = smask_from_zmask(z_mask);
-  * For softmmu, perform the TLB load and compare.
+     if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
-  * For useronly, perform any required alignment tests.
+         return true;
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+     }
-     int mem_index = get_mmuidx(oi);
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
-     int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read)
+     int width = 8 * memop_size(mop);
-                         : offsetof(CPUTLBEntry, addr_write);
--    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
+     if (width < 64) {
-+    int fast_off = tlb_mask_table_ofs(s, mem_index);
+-        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
-     int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
+-        if (!(mop & MO_SIGN)) {
-     int table_off = fast_off + offsetof(CPUTLBDescFast, table);
++        if (mop & MO_SIGN) {
++            ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
++        } else {
-     ldst->addrhi_reg = addrhi;
+             ctx->z_mask = MAKE_64BIT_MASK(0, width);
+-            ctx->s_mask <<= 1;
-     /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
+         }
--    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
+     }
--    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -32768);
-     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, mask_off);
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
-     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_AREG0, table_off);
+     fold_setcond_tst_pow2(ctx, op, false);
-diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
+     ctx->z_mask = 1;
-index XXXXXXX..XXXXXXX 100644
+-    ctx->s_mask = smask_from_zmask(1);
---- a/tcg/riscv/tcg-target.c.inc
+     return false;
 +++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
      return true;
  }
-+/* We expect to use a 12-bit negative offset from ENV.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
-+#define MIN_TLB_MASK_TABLE_OFS  -(1 << 11)
+     }
-+
- /*
+     ctx->z_mask = 1;
-  * For softmmu, perform the TLB load and compare.
+-    ctx->s_mask = smask_from_zmask(1);
-  * For useronly, perform any required alignment tests.
+     return false;
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
-     unsigned s_bits = opc & MO_SIZE;
+  do_setcond_const:
-     unsigned s_mask = (1u << s_bits) - 1;
+@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
-     int mem_index = get_mmuidx(oi);
+         break;
--    int fast_ofs = TLB_MASK_TABLE_OFS(mem_index);
+     CASE_OP_32_64(ld8u):
-+    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
+         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
-     int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask);
+-        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
-     int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table);
+         break;
-     int compare_mask;
+     CASE_OP_32_64(ld16s):
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
+         ctx->s_mask = MAKE_64BIT_MASK(16, 48);
-     ldst->oi = oi;
+         break;
-     ldst->addrlo_reg = addr_reg;
+     CASE_OP_32_64(ld16u):
+         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
--    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
+-        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
--    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 11));
+         break;
-     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs);
+     case INDEX_op_ld32s_i64:
-     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs);
+         ctx->s_mask = MAKE_64BIT_MASK(32, 32);
+         break;
-diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
+     case INDEX_op_ld32u_i64:
-index XXXXXXX..XXXXXXX 100644
+         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
---- a/tcg/s390x/tcg-target.c.inc
+-        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
-+++ b/tcg/s390x/tcg-target.c.inc
+         break;
-@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
+     default:
-     return true;
+         g_assert_not_reached();
  }
 +/* We're expecting to use a 20-bit negative offset on the tlb memory ops.  */
 +#define MIN_TLB_MASK_TABLE_OFS  -(1 << 19)
 +
  /*
   * For softmmu, perform the TLB load and compare.
   * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
  #ifdef CONFIG_SOFTMMU
      unsigned s_mask = (1 << s_bits) - 1;
      int mem_index = get_mmuidx(oi);
 -    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
 +    int fast_off = tlb_mask_table_ofs(s, mem_index);
      int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
      int table_off = fast_off + offsetof(CPUTLBDescFast, table);
      int ofs, a_off;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      tcg_out_sh64(s, RSY_SRLG, TCG_TMP0, addr_reg, TCG_REG_NONE,
                   s->page_bits - CPU_TLB_ENTRY_BITS);
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 19));
      tcg_out_insn(s, RXY, NG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, mask_off);
      tcg_out_insn(s, RXY, AG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, table_off);
 diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/sparc64/tcg-target.c.inc
 +++ b/tcg/sparc64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool tcg_target_has_memory_bswap(MemOp memop)
      return true;
  }
 +/* We expect to use a 13-bit negative offset from ENV.  */
 +#define MIN_TLB_MASK_TABLE_OFS  -(1 << 12)
 +
  /*
   * For softmmu, perform the TLB load and compare.
   * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
  #ifdef CONFIG_SOFTMMU
      int mem_index = get_mmuidx(oi);
 -    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
 +    int fast_off = tlb_mask_table_ofs(s, mem_index);
      int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
      int table_off = fast_off + offsetof(CPUTLBDescFast, table);
      int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read)
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      int cc;
      /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
 -    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 12));
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T2, TCG_AREG0, mask_off);
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T3, TCG_AREG0, table_off);
 --
-.34.1
+.43.0

-[PULL 29/52] tcg: Split tcg/tcg-op-gvec.h
+[PULL 08/72] tcg/optimize: Change representation of s_mask
-Create tcg/tcg-op-gvec-common.h, moving everything that does not
+Change the representation from sign bit repetitions to all bits equal
-concern TARGET_LONG_BITS.  Adjust tcg-op-gvec.c to use the new header.
+to the sign bit, including the sign bit itself.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+The previous format has a problem in that it is difficult to recreate
 a valid sign mask after a shift operation: the "repetitions" part of
 the previous format meant that applying the same shift as for the value
 led to an off-by-one value.
 The new format, including the sign bit itself, means that the sign mask
 can be manipulated in exactly the same way as the value, and canonicalization
 is easier.
 Canonicalize the s_mask in fold_masks_zs, rather than requiring callers
 to do so.  Treat 0 as a non-canonical but typeless input for no sign
 information, which will be reset as appropriate for the data type.
 We can easily fold in the data from z_mask while canonicalizing.
 Temporarily disable optimizations using s_mask while each operation is
 converted to use fold_masks_zs and to the new form.
 Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-op-gvec-common.h | 426 +++++++++++++++++++++++++++++
+ tcg/optimize.c | 64 ++++++++++++--------------------------------------
- include/tcg/tcg-op-gvec.h        | 444 +------------------------------
+file changed, 15 insertions(+), 49 deletions(-)
  tcg/tcg-op-gvec.c                |   2 +-
 files changed, 437 insertions(+), 435 deletions(-)
  create mode 100644 include/tcg/tcg-op-gvec-common.h
-diff --git a/include/tcg/tcg-op-gvec-common.h b/include/tcg/tcg-op-gvec-common.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/tcg/tcg-op-gvec-common.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Target independent generic vector operation expansion
 + *
 + * Copyright (c) 2018 Linaro
 + */
 +
 +#ifndef TCG_TCG_OP_GVEC_COMMON_H
 +#define TCG_TCG_OP_GVEC_COMMON_H
 +
 +/*
 + * "Generic" vectors.  All operands are given as offsets from ENV,
 + * and therefore cannot also be allocated via tcg_global_mem_new_*.
 + * OPRSZ is the byte size of the vector upon which the operation is performed.
 + * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
 + *
 + * All sizes must be 8 or any multiple of 16.
 + * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
 + * Operands may completely, but not partially, overlap.
 + */
 +
 +/* Expand a call to a gvec-style helper, with pointers to two vector
 +   operands, and a descriptor (see tcg-gvec-desc.h).  */
 +typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
 +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 +                        gen_helper_gvec_2 *fn);
 +
 +/* Similarly, passing an extra data value.  */
 +typedef void gen_helper_gvec_2i(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32);
 +void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 +                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 +                         gen_helper_gvec_2i *fn);
 +
 +/* Similarly, passing an extra pointer (e.g. env or float_status).  */
 +typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 +                        int32_t data, gen_helper_gvec_2_ptr *fn);
 +
 +/* Similarly, with three vector operands.  */
 +typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 +                        gen_helper_gvec_3 *fn);
 +
 +/* Similarly, with four vector operands.  */
 +typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
 +                               TCGv_ptr, TCGv_i32);
 +void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 +                        int32_t data, gen_helper_gvec_4 *fn);
 +
 +/* Similarly, with five vector operands.  */
 +typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
 +                               TCGv_ptr, TCGv_i32);
 +void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 +                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);
 +
 +typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
 +                                   TCGv_ptr, TCGv_i32);
 +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 +                        int32_t data, gen_helper_gvec_3_ptr *fn);
 +
 +typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
 +                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
 +void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 +                        uint32_t maxsz, int32_t data,
 +                        gen_helper_gvec_4_ptr *fn);
 +
 +typedef void gen_helper_gvec_5_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
 +                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
 +void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 +                        gen_helper_gvec_5_ptr *fn);
 +
 +/* Expand a gvec operation.  Either inline or out-of-line depending on
 +   the actual vector size and the operations supported by the host.  */
 +typedef struct {
 +    /* Expand inline as a 64-bit or 32-bit integer.
 +       Only one of these will be non-NULL.  */
 +    void (*fni8)(TCGv_i64, TCGv_i64);
 +    void (*fni4)(TCGv_i32, TCGv_i32);
 +    /* Expand inline with a host vector type.  */
 +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
 +    /* Expand out-of-line helper w/descriptor.  */
 +    gen_helper_gvec_2 *fno;
 +    /* The optional opcodes, if any, utilized by .fniv.  */
 +    const TCGOpcode *opt_opc;
 +    /* The data argument to the out-of-line helper.  */
 +    int32_t data;
 +    /* The vector element size, if applicable.  */
 +    uint8_t vece;
 +    /* Prefer i64 to v64.  */
 +    bool prefer_i64;
 +    /* Load dest as a 2nd source operand.  */
 +    bool load_dest;
 +} GVecGen2;
 +
 +typedef struct {
 +    /* Expand inline as a 64-bit or 32-bit integer.
 +       Only one of these will be non-NULL.  */
 +    void (*fni8)(TCGv_i64, TCGv_i64, int64_t);
 +    void (*fni4)(TCGv_i32, TCGv_i32, int32_t);
 +    /* Expand inline with a host vector type.  */
 +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, int64_t);
 +    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
 +    gen_helper_gvec_2 *fno;
 +    /* Expand out-of-line helper w/descriptor, data as argument.  */
 +    gen_helper_gvec_2i *fnoi;
 +    /* The optional opcodes, if any, utilized by .fniv.  */
 +    const TCGOpcode *opt_opc;
 +    /* The vector element size, if applicable.  */
 +    uint8_t vece;
 +    /* Prefer i64 to v64.  */
 +    bool prefer_i64;
 +    /* Load dest as a 3rd source operand.  */
 +    bool load_dest;
 +} GVecGen2i;
 +
 +typedef struct {
 +    /* Expand inline as a 64-bit or 32-bit integer.
 +       Only one of these will be non-NULL.  */
 +    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
 +    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
 +    /* Expand inline with a host vector type.  */
 +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
 +    /* Expand out-of-line helper w/descriptor.  */
 +    gen_helper_gvec_2i *fno;
 +    /* The optional opcodes, if any, utilized by .fniv.  */
 +    const TCGOpcode *opt_opc;
 +    /* The data argument to the out-of-line helper.  */
 +    uint32_t data;
 +    /* The vector element size, if applicable.  */
 +    uint8_t vece;
 +    /* Prefer i64 to v64.  */
 +    bool prefer_i64;
 +    /* Load scalar as 1st source operand.  */
 +    bool scalar_first;
 +} GVecGen2s;
 +
 +typedef struct {
 +    /* Expand inline as a 64-bit or 32-bit integer.
 +       Only one of these will be non-NULL.  */
 +    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
 +    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
 +    /* Expand inline with a host vector type.  */
 +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
 +    /* Expand out-of-line helper w/descriptor.  */
 +    gen_helper_gvec_3 *fno;
 +    /* The optional opcodes, if any, utilized by .fniv.  */
 +    const TCGOpcode *opt_opc;
 +    /* The data argument to the out-of-line helper.  */
 +    int32_t data;
 +    /* The vector element size, if applicable.  */
 +    uint8_t vece;
 +    /* Prefer i64 to v64.  */
 +    bool prefer_i64;
 +    /* Load dest as a 3rd source operand.  */
 +    bool load_dest;
 +} GVecGen3;
 +
 +typedef struct {
 +    /*
 +     * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
 +     * non-NULL.
 +     */
 +    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
 +    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
 +    /* Expand inline with a host vector type.  */
 +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
 +    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
 +    gen_helper_gvec_3 *fno;
 +    /* The optional opcodes, if any, utilized by .fniv.  */
 +    const TCGOpcode *opt_opc;
 +    /* The vector element size, if applicable.  */
 +    uint8_t vece;
 +    /* Prefer i64 to v64.  */
 +    bool prefer_i64;
 +    /* Load dest as a 3rd source operand.  */
 +    bool load_dest;
 +} GVecGen3i;
 +
 +typedef struct {
 +    /* Expand inline as a 64-bit or 32-bit integer.
 +       Only one of these will be non-NULL.  */
 +    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
 +    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
 +    /* Expand inline with a host vector type.  */
 +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
 +    /* Expand out-of-line helper w/descriptor.  */
 +    gen_helper_gvec_4 *fno;
 +    /* The optional opcodes, if any, utilized by .fniv.  */
 +    const TCGOpcode *opt_opc;
 +    /* The data argument to the out-of-line helper.  */
 +    int32_t data;
 +    /* The vector element size, if applicable.  */
 +    uint8_t vece;
 +    /* Prefer i64 to v64.  */
 +    bool prefer_i64;
 +    /* Write aofs as a 2nd dest operand.  */
 +    bool write_aofs;
 +} GVecGen4;
 +
 +typedef struct {
 +    /*
 +     * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
 +     * non-NULL.
 +     */
 +    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
 +    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
 +    /* Expand inline with a host vector type.  */
 +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
 +    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
 +    gen_helper_gvec_4 *fno;
 +    /* The optional opcodes, if any, utilized by .fniv.  */
 +    const TCGOpcode *opt_opc;
 +    /* The vector element size, if applicable.  */
 +    uint8_t vece;
 +    /* Prefer i64 to v64.  */
 +    bool prefer_i64;
 +} GVecGen4i;
 +
 +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
 +                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
 +void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 +                     uint32_t maxsz, int64_t c, const GVecGen2i *);
 +void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 +                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *);
 +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
 +void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 +                     uint32_t oprsz, uint32_t maxsz, int64_t c,
 +                     const GVecGen3i *);
 +void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
 +                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);
 +void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
 +                     uint32_t oprsz, uint32_t maxsz, int64_t c,
 +                     const GVecGen4i *);
 +
 +/* Expand a specific vector operation.  */
 +
 +void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       int64_t c, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       int64_t c, uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 +
 +/* Saturated arithmetic.  */
 +void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +
 +/* Min/max.  */
 +void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       int64_t c, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       int64_t c, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      int64_t c, uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                          uint32_t s, uint32_t m);
 +void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t s,
 +                          uint32_t m, uint64_t imm);
 +void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
 +                          uint32_t m, TCGv_i32);
 +void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
 +                          uint32_t m, TCGv_i64);
 +
 +void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 +
 +/*
 + * Perform vector shift by vector element, modulo the element size.
 + * E.g.  D[i] = A[i] << (B[i] % (8 << vece)).
 + */
 +void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 +
 +void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
 +                      uint32_t aofs, uint32_t bofs,
 +                      uint32_t oprsz, uint32_t maxsz);
 +
 +/*
 + * Perform vector bit select: d = (b & a) | (c & ~a).
 + */
 +void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
 +                         uint32_t bofs, uint32_t cofs,
 +                         uint32_t oprsz, uint32_t maxsz);
 +
 +/*
 + * 64-bit vector operations.  Use these when the register has been allocated
 + * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
 + * OPRSZ = MAXSZ = 8.
 + */
 +
 +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
 +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
 +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
 +
 +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 +
 +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 +
 +void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
 +void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
 +void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
 +void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
 +void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
 +void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
 +void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c);
 +void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c);
 +
 +/* 32-bit vector operations. */
 +void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
 +void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
 +
 +void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
 +void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
 +
 +void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
 +void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
 +void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
 +void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
 +void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
 +void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
 +
 +#endif
 diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op-gvec.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg-op-gvec.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
+     QSIMPLEQ_HEAD(, MemCopyInfo) mem_copy;
- /*
+     uint64_t val;
-- * Generic vector operation expansion
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
-+ * Target dependent generic vector operation expansion
+-    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
-  *
++    uint64_t s_mask;  /* mask bit is 1 if value bit matches msb */
-  * Copyright (c) 2018 Linaro
+ } TempOptInfo;
-- *
-- * This library is free software; you can redistribute it and/or
+ typedef struct OptContext {
-- * modify it under the terms of the GNU Lesser General Public
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-- * License as published by the Free Software Foundation; either
-- * version 2.1 of the License, or (at your option) any later version.
+     /* In flight values from optimization. */
-- *
+     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
-- * This library is distributed in the hope that it will be useful,
+-    uint64_t s_mask;  /* mask of clrsb(value) bits */
-- * but WITHOUT ANY WARRANTY; without even the implied warranty of
++    uint64_t s_mask;  /* mask bit is 1 if value bit matches msb */
-- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     TCGType type;
-- * Lesser General Public License for more details.
+ } OptContext;
-- *
-- * You should have received a copy of the GNU Lesser General Public
+-/* Calculate the smask for a specific value. */
-- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+-static uint64_t smask_from_value(uint64_t value)
-  */
+-{
+-    int rep = clrsb64(value);
- #ifndef TCG_TCG_OP_GVEC_H
+-    return ~(~0ull >> rep);
- #define TCG_TCG_OP_GVEC_H
+-}
 -/*
 - * "Generic" vectors.  All operands are given as offsets from ENV,
 - * and therefore cannot also be allocated via tcg_global_mem_new_*.
 - * OPRSZ is the byte size of the vector upon which the operation is performed.
 - * MAXSZ is the byte size of the full vector; bytes beyond OPSZ are cleared.
 - *
 - * All sizes must be 8 or any multiple of 16.
 - * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
 - * Operands may completely, but not partially, overlap.
 - */
 +#include "tcg/tcg-op-gvec-common.h"
 -/* Expand a call to a gvec-style helper, with pointers to two vector
 -   operands, and a descriptor (see tcg-gvec-desc.h).  */
 -typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
 -void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 -                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 -                        gen_helper_gvec_2 *fn);
 -
 -/* Similarly, passing an extra data value.  */
 -typedef void gen_helper_gvec_2i(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32);
 -void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 -                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 -                         gen_helper_gvec_2i *fn);
 -
 -/* Similarly, passing an extra pointer (e.g. env or float_status).  */
 -typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 -void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 -                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 -                        int32_t data, gen_helper_gvec_2_ptr *fn);
 -
 -/* Similarly, with three vector operands.  */
 -typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 -void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 -                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 -                        gen_helper_gvec_3 *fn);
 -
 -/* Similarly, with four vector operands.  */
 -typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
 -                               TCGv_ptr, TCGv_i32);
 -void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 -                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 -                        int32_t data, gen_helper_gvec_4 *fn);
 -
 -/* Similarly, with five vector operands.  */
 -typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
 -                               TCGv_ptr, TCGv_i32);
 -void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 -                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 -                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);
 -
 -typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
 -                                   TCGv_ptr, TCGv_i32);
 -void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 -                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 -                        int32_t data, gen_helper_gvec_3_ptr *fn);
 -
 -typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
 -                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
 -void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 -                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 -                        uint32_t maxsz, int32_t data,
 -                        gen_helper_gvec_4_ptr *fn);
 -
 -typedef void gen_helper_gvec_5_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
 -                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
 -void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 -                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 -                        uint32_t oprsz, uint32_t maxsz, int32_t data,
 -                        gen_helper_gvec_5_ptr *fn);
 -
 -/* Expand a gvec operation.  Either inline or out-of-line depending on
 -   the actual vector size and the operations supported by the host.  */
 -typedef struct {
 -    /* Expand inline as a 64-bit or 32-bit integer.
 -       Only one of these will be non-NULL.  */
 -    void (*fni8)(TCGv_i64, TCGv_i64);
 -    void (*fni4)(TCGv_i32, TCGv_i32);
 -    /* Expand inline with a host vector type.  */
 -    void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
 -    /* Expand out-of-line helper w/descriptor.  */
 -    gen_helper_gvec_2 *fno;
 -    /* The optional opcodes, if any, utilized by .fniv.  */
 -    const TCGOpcode *opt_opc;
 -    /* The data argument to the out-of-line helper.  */
 -    int32_t data;
 -    /* The vector element size, if applicable.  */
 -    uint8_t vece;
 -    /* Prefer i64 to v64.  */
 -    bool prefer_i64;
 -    /* Load dest as a 2nd source operand.  */
 -    bool load_dest;
 -} GVecGen2;
 -
 -typedef struct {
 -    /* Expand inline as a 64-bit or 32-bit integer.
 -       Only one of these will be non-NULL.  */
 -    void (*fni8)(TCGv_i64, TCGv_i64, int64_t);
 -    void (*fni4)(TCGv_i32, TCGv_i32, int32_t);
 -    /* Expand inline with a host vector type.  */
 -    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, int64_t);
 -    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
 -    gen_helper_gvec_2 *fno;
 -    /* Expand out-of-line helper w/descriptor, data as argument.  */
 -    gen_helper_gvec_2i *fnoi;
 -    /* The optional opcodes, if any, utilized by .fniv.  */
 -    const TCGOpcode *opt_opc;
 -    /* The vector element size, if applicable.  */
 -    uint8_t vece;
 -    /* Prefer i64 to v64.  */
 -    bool prefer_i64;
 -    /* Load dest as a 3rd source operand.  */
 -    bool load_dest;
 -} GVecGen2i;
 -
 -typedef struct {
 -    /* Expand inline as a 64-bit or 32-bit integer.
 -       Only one of these will be non-NULL.  */
 -    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
 -    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
 -    /* Expand inline with a host vector type.  */
 -    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
 -    /* Expand out-of-line helper w/descriptor.  */
 -    gen_helper_gvec_2i *fno;
 -    /* The optional opcodes, if any, utilized by .fniv.  */
 -    const TCGOpcode *opt_opc;
 -    /* The data argument to the out-of-line helper.  */
 -    uint32_t data;
 -    /* The vector element size, if applicable.  */
 -    uint8_t vece;
 -    /* Prefer i64 to v64.  */
 -    bool prefer_i64;
 -    /* Load scalar as 1st source operand.  */
 -    bool scalar_first;
 -} GVecGen2s;
 -
 -typedef struct {
 -    /* Expand inline as a 64-bit or 32-bit integer.
 -       Only one of these will be non-NULL.  */
 -    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
 -    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
 -    /* Expand inline with a host vector type.  */
 -    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
 -    /* Expand out-of-line helper w/descriptor.  */
 -    gen_helper_gvec_3 *fno;
 -    /* The optional opcodes, if any, utilized by .fniv.  */
 -    const TCGOpcode *opt_opc;
 -    /* The data argument to the out-of-line helper.  */
 -    int32_t data;
 -    /* The vector element size, if applicable.  */
 -    uint8_t vece;
 -    /* Prefer i64 to v64.  */
 -    bool prefer_i64;
 -    /* Load dest as a 3rd source operand.  */
 -    bool load_dest;
 -} GVecGen3;
 -
 -typedef struct {
 -    /*
 -     * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
 -     * non-NULL.
 -     */
 -    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
 -    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
 -    /* Expand inline with a host vector type.  */
 -    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
 -    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
 -    gen_helper_gvec_3 *fno;
 -    /* The optional opcodes, if any, utilized by .fniv.  */
 -    const TCGOpcode *opt_opc;
 -    /* The vector element size, if applicable.  */
 -    uint8_t vece;
 -    /* Prefer i64 to v64.  */
 -    bool prefer_i64;
 -    /* Load dest as a 3rd source operand.  */
 -    bool load_dest;
 -} GVecGen3i;
 -
 -typedef struct {
 -    /* Expand inline as a 64-bit or 32-bit integer.
 -       Only one of these will be non-NULL.  */
 -    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
 -    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
 -    /* Expand inline with a host vector type.  */
 -    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
 -    /* Expand out-of-line helper w/descriptor.  */
 -    gen_helper_gvec_4 *fno;
 -    /* The optional opcodes, if any, utilized by .fniv.  */
 -    const TCGOpcode *opt_opc;
 -    /* The data argument to the out-of-line helper.  */
 -    int32_t data;
 -    /* The vector element size, if applicable.  */
 -    uint8_t vece;
 -    /* Prefer i64 to v64.  */
 -    bool prefer_i64;
 -    /* Write aofs as a 2nd dest operand.  */
 -    bool write_aofs;
 -} GVecGen4;
 -
 -typedef struct {
 -    /*
 -     * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
 -     * non-NULL.
 -     */
 -    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
 -    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
 -    /* Expand inline with a host vector type.  */
 -    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
 -    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
 -    gen_helper_gvec_4 *fno;
 -    /* The optional opcodes, if any, utilized by .fniv.  */
 -    const TCGOpcode *opt_opc;
 -    /* The vector element size, if applicable.  */
 -    uint8_t vece;
 -    /* Prefer i64 to v64.  */
 -    bool prefer_i64;
 -} GVecGen4i;
 -
 -void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
 -                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
 -void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 -                     uint32_t maxsz, int64_t c, const GVecGen2i *);
 -void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 -                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *);
 -void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 -                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
 -void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 -                     uint32_t oprsz, uint32_t maxsz, int64_t c,
 -                     const GVecGen3i *);
 -void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
 -                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);
 -void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
 -                     uint32_t oprsz, uint32_t maxsz, int64_t c,
 -                     const GVecGen4i *);
 -
 -/* Expand a specific vector operation.  */
 -
 -void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t oprsz, uint32_t maxsz);
 -
 -void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -
 -void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       int64_t c, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       int64_t c, uint32_t oprsz, uint32_t maxsz);
 -
 -void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 -
 -/* Saturated arithmetic.  */
 -void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -
 -/* Min/max.  */
 -void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -
 -void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 -
 -void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       int64_t c, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       int64_t c, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      int64_t c, uint32_t oprsz, uint32_t maxsz);
 -
 -void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
 -
 -void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                          uint32_t s, uint32_t m);
 -void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t s,
 -                          uint32_t m, uint64_t imm);
 -void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
 -                          uint32_t m, TCGv_i32);
 -void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
 -                          uint32_t m, TCGv_i64);
 -
 -#if TARGET_LONG_BITS == 64
 -# define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i64
 -#else
 -# define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i32
 +#ifndef TARGET_LONG_BITS
 +#error must include QEMU headers
  #endif
 -void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
 -
 -void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 -void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
 -                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
 -
 -/*
-- * Perform vector shift by vector element, modulo the element size.
+- * Calculate the smask for a given set of known-zeros.
-- * E.g.  D[i] = A[i] << (B[i] % (8 << vece)).
+- * If there are lots of zeros on the left, we can consider the remainder
 - * an unsigned field, and thus the corresponding signed field is one bit
 - * larger.
 - */
--void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
+-static uint64_t smask_from_zmask(uint64_t zmask)
--                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+-{
--void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
+-    /*
--                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+-     * Only the 0 bits are significant for zmask, thus the msb itself
--void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
+-     * must be zero, else we have no sign information.
--                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+-     */
--void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
+-    int rep = clz64(zmask);
--                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+-    if (rep == 0) {
--void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
+-        return 0;
--                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+-    }
--
+-    rep -= 1;
--void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
+-    return ~(~0ull >> rep);
--                      uint32_t aofs, uint32_t bofs,
+-}
 -                      uint32_t oprsz, uint32_t maxsz);
 -
 -/*
-- * Perform vector bit select: d = (b & a) | (c & ~a).
+- * Recreate a properly left-aligned smask after manipulation.
 - * Some bit-shuffling, particularly shifts and rotates, may
 - * retain sign bits on the left, but may scatter disconnected
 - * sign bits on the right.  Retain only what remains to the left.
 - */
--void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
+-static uint64_t smask_from_smask(int64_t smask)
--                         uint32_t bofs, uint32_t cofs,
+-{
--                         uint32_t oprsz, uint32_t maxsz);
+-    /* Only the 1 bits are significant for smask */
 -    return smask_from_zmask(~smask);
 -}
 -
--/*
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
-- * 64-bit vector operations.  Use these when the register has been allocated
+ {
-- * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
+     return ts->state_ptr;
-- * OPRSZ = MAXSZ = 8.
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
-- */
+         ti->is_const = true;
--
+         ti->val = ts->val;
--void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
+         ti->z_mask = ts->val;
--void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
+-        ti->s_mask = smask_from_value(ts->val);
--void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
++        ti->s_mask = INT64_MIN >> clrsb64(ts->val);
--
+     } else {
--void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+         ti->is_const = false;
--void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+         ti->z_mask = -1;
--void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
--
+          */
--void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+         if (i == 0) {
--void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+             ts_info(ts)->z_mask = ctx->z_mask;
--void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+-            ts_info(ts)->s_mask = ctx->s_mask;
--
+         }
--void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+     }
--void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+ }
--void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
--void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+  * The passed s_mask may be augmented by z_mask.
--void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+  */
--void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
--void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c);
+-                          uint64_t z_mask, uint64_t s_mask)
--void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c);
++                          uint64_t z_mask, int64_t s_mask)
--
+ {
--/* 32-bit vector operations. */
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
--void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+     TCGTemp *ts;
--void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+     TempOptInfo *ti;
--
++    int rep;
--void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
--void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+     /* Only single-output opcodes are supported here. */
--
+     tcg_debug_assert(def->nb_oargs == 1);
--void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
--void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+      */
--void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+     if (ctx->type == TCG_TYPE_I32) {
--void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+         z_mask = (int32_t)z_mask;
--void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+-        s_mask |= MAKE_64BIT_MASK(32, 32);
--void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
++        s_mask |= INT32_MIN;
--
+     }
- #if TARGET_LONG_BITS == 64
-+#define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i64
+     if (z_mask == 0) {
- #define tcg_gen_vec_add8_tl  tcg_gen_vec_add8_i64
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
- #define tcg_gen_vec_sub8_tl  tcg_gen_vec_sub8_i64
- #define tcg_gen_vec_add16_tl tcg_gen_vec_add16_i64
+     ti = ts_info(ts);
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+     ti->z_mask = z_mask;
- #define tcg_gen_vec_shl16i_tl tcg_gen_vec_shl16i_i64
+-    ti->s_mask = s_mask | smask_from_zmask(z_mask);
- #define tcg_gen_vec_shr16i_tl tcg_gen_vec_shr16i_i64
++
- #define tcg_gen_vec_sar16i_tl tcg_gen_vec_sar16i_i64
++    /* Canonicalize s_mask and incorporate data from z_mask. */
--
++    rep = clz64(~s_mask);
--#else
++    rep = MAX(rep, clz64(z_mask));
-+#elif TARGET_LONG_BITS == 32
++    rep = MAX(rep - 1, 0);
-+#define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i32
++    ti->s_mask = INT64_MIN >> rep;
- #define tcg_gen_vec_add8_tl  tcg_gen_vec_add8_i32
++
- #define tcg_gen_vec_sub8_tl  tcg_gen_vec_sub8_i32
+     return true;
- #define tcg_gen_vec_add16_tl tcg_gen_vec_add16_i32
+ }
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
- #define tcg_gen_vec_shl16i_tl tcg_gen_vec_shl16i_i32
+@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
- #define tcg_gen_vec_shr16i_tl tcg_gen_vec_shr16i_i32
- #define tcg_gen_vec_sar16i_tl tcg_gen_vec_sar16i_i32
+     ctx->z_mask = z_mask;
-+#else
+     ctx->s_mask = s_mask;
-+# error
+-    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
- #endif
++    if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+         return true;
- #endif
+     }
-diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
---- a/tcg/tcg-op-gvec.c
+     s_mask |= MAKE_64BIT_MASK(len, 64 - len);
-+++ b/tcg/tcg-op-gvec.c
+     ctx->s_mask = s_mask;
-@@ -XXX,XX +XXX,XX @@
- #include "tcg/tcg.h"
+-    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
- #include "tcg/tcg-temp-internal.h"
++    if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
- #include "tcg/tcg-op-common.h"
+         return true;
--#include "tcg/tcg-op-gvec.h"
+     }
-+#include "tcg/tcg-op-gvec-common.h"
- #include "tcg/tcg-gvec-desc.h"
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
+         ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
- #define MAX_UNROLL  4
          s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
 -        ctx->s_mask = smask_from_smask(s_mask);
          return fold_masks(ctx, op);
      }
 --
-.34.1
+.43.0

-[PULL 22/52] tcg: Split tcg_gen_callN
+[PULL 09/72] tcg/optimize: Use finish_folding in fold_add, fold_add_vec, fold_addsub2
-Make tcg_gen_callN a static function.  Create tcg_gen_call[0-7]
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 functions for use by helper-gen.h.inc.
 Removes a multiplicity of calls to __stack_chk_fail, saving up
 to 143kiB of .text space as measured on an x86_64 host.
     Old     New Less    %Change
 8888680    8741816    146864    1.65%    qemu-system-aarch64
 5911832    5856152    55680    0.94%    qemu-system-riscv64
 5816728    5767512    49216    0.85%    qemu-system-mips64
 6707832    6659144    48688    0.73%    qemu-system-ppc64
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/helper-gen.h | 40 ++++++++++++++---------------
+ tcg/optimize.c | 9 +++++----
- include/tcg/tcg.h         | 14 +++++++++-
+file changed, 5 insertions(+), 4 deletions(-)
  tcg/tcg.c                 | 54 ++++++++++++++++++++++++++++++++++++++-
 files changed, 86 insertions(+), 22 deletions(-)
-diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/helper-gen.h
+--- a/tcg/optimize.c
-+++ b/include/exec/helper-gen.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static void finish_ebb(OptContext *ctx)
- extern TCGHelperInfo glue(helper_info_, name);                          \
+     remove_mem_copy_all(ctx);
  static inline void glue(gen_helper_, name)(dh_retvar_decl0(ret))        \
  {                                                                       \
 -    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 0, NULL);  \
 +    tcg_gen_call0(&glue(helper_info_, name), dh_retvar(ret));           \
  }
- #define DEF_HELPER_FLAGS_1(name, flags, ret, t1)                        \
+-static void finish_folding(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ extern TCGHelperInfo glue(helper_info_, name);                          \
++static bool finish_folding(OptContext *ctx, TCGOp *op)
- static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+ {
-     dh_arg_decl(t1, 1))                                                 \
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
- {                                                                       \
+     int i, nb_oargs;
--    TCGTemp *args[1] = { dh_arg(t1, 1) };                               \
+@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
--    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 1, args);  \
+             ts_info(ts)->z_mask = ctx->z_mask;
-+    tcg_gen_call1(&glue(helper_info_, name), dh_retvar(ret),            \
+         }
-+                  dh_arg(t1, 1));                                       \
+     }
 +    return true;
  }
- #define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2)                    \
+ /*
-@@ -XXX,XX +XXX,XX @@ extern TCGHelperInfo glue(helper_info_, name);                          \
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
- static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+         fold_xi_to_x(ctx, op, 0)) {
-     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2))                             \
+         return true;
- {                                                                       \
+     }
--    TCGTemp *args[2] = { dh_arg(t1, 1), dh_arg(t2, 2) };                \
+-    return false;
--    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 2, args);  \
++    return finish_folding(ctx, op);
 +    tcg_gen_call2(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2));                        \
  }
- #define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3)                \
+ /* We cannot as yet do_constant_folding with vectors. */
-@@ -XXX,XX +XXX,XX @@ extern TCGHelperInfo glue(helper_info_, name);                          \
+@@ -XXX,XX +XXX,XX @@ static bool fold_add_vec(OptContext *ctx, TCGOp *op)
- static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+         fold_xi_to_x(ctx, op, 0)) {
-     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3))         \
+         return true;
- {                                                                       \
+     }
--    TCGTemp *args[3] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3) }; \
+-    return false;
--    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 3, args);  \
++    return finish_folding(ctx, op);
 +    tcg_gen_call3(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3));         \
  }
- #define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4)            \
+ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
-@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
-     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2),                             \
+         op->args[4] = arg_new_constant(ctx, bl);
-     dh_arg_decl(t3, 3), dh_arg_decl(t4, 4))                             \
+         op->args[5] = arg_new_constant(ctx, bh);
- {                                                                       \
+     }
--    TCGTemp *args[4] = { dh_arg(t1, 1), dh_arg(t2, 2),                  \
+-    return false;
--                         dh_arg(t3, 3), dh_arg(t4, 4) };                \
++    return finish_folding(ctx, op);
 -    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 4, args);  \
 +    tcg_gen_call4(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2),                         \
 +                  dh_arg(t3, 3), dh_arg(t4, 4));                        \
  }
- #define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5)        \
+ static bool fold_add2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
      dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
      dh_arg_decl(t4, 4), dh_arg_decl(t5, 5))                             \
  {                                                                       \
 -    TCGTemp *args[5] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
 -                         dh_arg(t4, 4), dh_arg(t5, 5) };                \
 -    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 5, args);  \
 +    tcg_gen_call5(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 +                  dh_arg(t4, 4), dh_arg(t5, 5));                        \
  }
  #define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6)    \
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
      dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
      dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6))         \
  {                                                                       \
 -    TCGTemp *args[6] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
 -                         dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6) }; \
 -    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 6, args);  \
 +    tcg_gen_call6(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 +                  dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6));         \
  }
  #define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
      dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
      dh_arg_decl(t7, 7))                                                 \
  {                                                                       \
 -    TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
 -                         dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),   \
 -                         dh_arg(t7, 7) };                               \
 -    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 7, args);  \
 +    tcg_gen_call7(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 +                  dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),          \
 +                  dh_arg(t7, 7));                                       \
  }
  #include "helper.h"
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg.h
 +++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef struct TCGTargetOpDef {
  bool tcg_op_supported(TCGOpcode op);
 -void tcg_gen_callN(TCGHelperInfo *, TCGTemp *ret, int nargs, TCGTemp **args);
 +void tcg_gen_call0(TCGHelperInfo *, TCGTemp *ret);
 +void tcg_gen_call1(TCGHelperInfo *, TCGTemp *ret, TCGTemp *);
 +void tcg_gen_call2(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *);
 +void tcg_gen_call3(TCGHelperInfo *, TCGTemp *ret, TCGTemp *,
 +                   TCGTemp *, TCGTemp *);
 +void tcg_gen_call4(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *,
 +                   TCGTemp *, TCGTemp *);
 +void tcg_gen_call5(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *,
 +                   TCGTemp *, TCGTemp *, TCGTemp *);
 +void tcg_gen_call6(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *,
 +                   TCGTemp *, TCGTemp *, TCGTemp *, TCGTemp *);
 +void tcg_gen_call7(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *,
 +                   TCGTemp *, TCGTemp *, TCGTemp *, TCGTemp *, TCGTemp *);
  TCGOp *tcg_emit_op(TCGOpcode opc, unsigned nargs);
  void tcg_op_remove(TCGContext *s, TCGOp *op);
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
  static TCGOp *tcg_op_alloc(TCGOpcode opc, unsigned nargs);
 -void tcg_gen_callN(TCGHelperInfo *info, TCGTemp *ret, int nargs, TCGTemp **args)
 +static void tcg_gen_callN(TCGHelperInfo *info, TCGTemp *ret, TCGTemp **args)
  {
      TCGv_i64 extend_free[MAX_CALL_IARGS];
      int n_extend = 0;
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(TCGHelperInfo *info, TCGTemp *ret, int nargs, TCGTemp **args)
      }
  }
 +void tcg_gen_call0(TCGHelperInfo *info, TCGTemp *ret)
 +{
 +    tcg_gen_callN(info, ret, NULL);
 +}
 +
 +void tcg_gen_call1(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1)
 +{
 +    tcg_gen_callN(info, ret, &t1);
 +}
 +
 +void tcg_gen_call2(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1, TCGTemp *t2)
 +{
 +    TCGTemp *args[2] = { t1, t2 };
 +    tcg_gen_callN(info, ret, args);
 +}
 +
 +void tcg_gen_call3(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1,
 +                   TCGTemp *t2, TCGTemp *t3)
 +{
 +    TCGTemp *args[3] = { t1, t2, t3 };
 +    tcg_gen_callN(info, ret, args);
 +}
 +
 +void tcg_gen_call4(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1,
 +                   TCGTemp *t2, TCGTemp *t3, TCGTemp *t4)
 +{
 +    TCGTemp *args[4] = { t1, t2, t3, t4 };
 +    tcg_gen_callN(info, ret, args);
 +}
 +
 +void tcg_gen_call5(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1,
 +                   TCGTemp *t2, TCGTemp *t3, TCGTemp *t4, TCGTemp *t5)
 +{
 +    TCGTemp *args[5] = { t1, t2, t3, t4, t5 };
 +    tcg_gen_callN(info, ret, args);
 +}
 +
 +void tcg_gen_call6(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1, TCGTemp *t2,
 +                   TCGTemp *t3, TCGTemp *t4, TCGTemp *t5, TCGTemp *t6)
 +{
 +    TCGTemp *args[6] = { t1, t2, t3, t4, t5, t6 };
 +    tcg_gen_callN(info, ret, args);
 +}
 +
 +void tcg_gen_call7(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1,
 +                   TCGTemp *t2, TCGTemp *t3, TCGTemp *t4,
 +                   TCGTemp *t5, TCGTemp *t6, TCGTemp *t7)
 +{
 +    TCGTemp *args[7] = { t1, t2, t3, t4, t5, t6, t7 };
 +    tcg_gen_callN(info, ret, args);
 +}
 +
  static void tcg_reg_alloc_start(TCGContext *s)
  {
      int i, n;
 --
-.34.1
+.43.0

-[PULL 23/52] tcg: Split helper-gen.h
+[PULL 10/72] tcg/optimize: Introduce const value accessors for TempOptInfo
-Create helper-gen-common.h without the target specific portion.
+Introduce ti_is_const, ti_const_val, ti_is_const_val.
 Use that in tcg-op-common.h.  Reorg headers in target/arm to
 ensure that helper-gen.h is included before helper-info.c.inc.
 All other targets are already correct in this regard.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- MAINTAINERS                      |   1 +
+ tcg/optimize.c | 20 +++++++++++++++++---
- include/exec/helper-gen-common.h |  18 ++++++
+file changed, 17 insertions(+), 3 deletions(-)
  include/exec/helper-gen.h        | 101 ++----------------------------
  include/tcg/tcg-op-common.h      |   2 +-
  include/exec/helper-gen.h.inc    | 102 +++++++++++++++++++++++++++++++
  target/arm/tcg/translate.c       |   8 +--
 files changed, 129 insertions(+), 103 deletions(-)
  create mode 100644 include/exec/helper-gen-common.h
  create mode 100644 include/exec/helper-gen.h.inc
-diff --git a/MAINTAINERS b/MAINTAINERS
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/MAINTAINERS
+--- a/tcg/optimize.c
-+++ b/MAINTAINERS
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ F: include/exec/exec-all.h
+@@ -XXX,XX +XXX,XX @@ static inline TempOptInfo *arg_info(TCGArg arg)
- F: include/exec/tb-flush.h
+     return ts_info(arg_temp(arg));
- F: include/exec/target_long.h
+ }
- F: include/exec/helper*.h
-+F: include/exec/helper*.h.inc
++static inline bool ti_is_const(TempOptInfo *ti)
- F: include/exec/helper-info.c.inc
++{
- F: include/sysemu/cpus.h
++    return ti->is_const;
  F: include/sysemu/tcg.h
 diff --git a/include/exec/helper-gen-common.h b/include/exec/helper-gen-common.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/exec/helper-gen-common.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Helper file for declaring TCG helper functions.
 + * This one expands generation functions for tcg opcodes.
 + */
 +
 +#ifndef HELPER_GEN_COMMON_H
 +#define HELPER_GEN_COMMON_H
 +
 +#define HELPER_H "accel/tcg/tcg-runtime.h"
 +#include "exec/helper-gen.h.inc"
 +#undef  HELPER_H
 +
 +#define HELPER_H "accel/tcg/plugin-helpers.h"
 +#include "exec/helper-gen.h.inc"
 +#undef  HELPER_H
 +
 +#endif /* HELPER_GEN_COMMON_H */
 diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/helper-gen.h
 +++ b/include/exec/helper-gen.h
@@ -XXX,XX +XXX,XX @@
  /*
   * Helper file for declaring TCG helper functions.
   * This one expands generation functions for tcg opcodes.
 - * Define HELPER_H for the header file to be expanded,
 - * and static inline to change from global file scope.
   */
  #ifndef HELPER_GEN_H
  #define HELPER_GEN_H
 -#include "tcg/tcg.h"
 -#include "tcg/helper-info.h"
 -#include "exec/helper-head.h"
 +#include "exec/helper-gen-common.h"
 -#define DEF_HELPER_FLAGS_0(name, flags, ret)                            \
 -extern TCGHelperInfo glue(helper_info_, name);                          \
 -static inline void glue(gen_helper_, name)(dh_retvar_decl0(ret))        \
 -{                                                                       \
 -    tcg_gen_call0(&glue(helper_info_, name), dh_retvar(ret));           \
 -}
 -
 -#define DEF_HELPER_FLAGS_1(name, flags, ret, t1)                        \
 -extern TCGHelperInfo glue(helper_info_, name);                          \
 -static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1))                                                 \
 -{                                                                       \
 -    tcg_gen_call1(&glue(helper_info_, name), dh_retvar(ret),            \
 -                  dh_arg(t1, 1));                                       \
 -}
 -
 -#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2)                    \
 -extern TCGHelperInfo glue(helper_info_, name);                          \
 -static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2))                             \
 -{                                                                       \
 -    tcg_gen_call2(&glue(helper_info_, name), dh_retvar(ret),            \
 -                  dh_arg(t1, 1), dh_arg(t2, 2));                        \
 -}
 -
 -#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3)                \
 -extern TCGHelperInfo glue(helper_info_, name);                          \
 -static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3))         \
 -{                                                                       \
 -    tcg_gen_call3(&glue(helper_info_, name), dh_retvar(ret),            \
 -                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3));         \
 -}
 -
 -#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4)            \
 -extern TCGHelperInfo glue(helper_info_, name);                          \
 -static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2),                             \
 -    dh_arg_decl(t3, 3), dh_arg_decl(t4, 4))                             \
 -{                                                                       \
 -    tcg_gen_call4(&glue(helper_info_, name), dh_retvar(ret),            \
 -                  dh_arg(t1, 1), dh_arg(t2, 2),                         \
 -                  dh_arg(t3, 3), dh_arg(t4, 4));                        \
 -}
 -
 -#define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5)        \
 -extern TCGHelperInfo glue(helper_info_, name);                          \
 -static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
 -    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5))                             \
 -{                                                                       \
 -    tcg_gen_call5(&glue(helper_info_, name), dh_retvar(ret),            \
 -                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 -                  dh_arg(t4, 4), dh_arg(t5, 5));                        \
 -}
 -
 -#define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6)    \
 -extern TCGHelperInfo glue(helper_info_, name);                          \
 -static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
 -    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6))         \
 -{                                                                       \
 -    tcg_gen_call6(&glue(helper_info_, name), dh_retvar(ret),            \
 -                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 -                  dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6));         \
 -}
 -
 -#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
 -extern TCGHelperInfo glue(helper_info_, name);                          \
 -static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
 -    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
 -    dh_arg_decl(t7, 7))                                                 \
 -{                                                                       \
 -    tcg_gen_call7(&glue(helper_info_, name), dh_retvar(ret),            \
 -                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 -                  dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),          \
 -                  dh_arg(t7, 7));                                       \
 -}
 -
 -#include "helper.h"
 -#include "accel/tcg/tcg-runtime.h"
 -#include "accel/tcg/plugin-helpers.h"
 -
 -#undef DEF_HELPER_FLAGS_0
 -#undef DEF_HELPER_FLAGS_1
 -#undef DEF_HELPER_FLAGS_2
 -#undef DEF_HELPER_FLAGS_3
 -#undef DEF_HELPER_FLAGS_4
 -#undef DEF_HELPER_FLAGS_5
 -#undef DEF_HELPER_FLAGS_6
 -#undef DEF_HELPER_FLAGS_7
 +#define HELPER_H "helper.h"
 +#include "exec/helper-gen.h.inc"
 +#undef  HELPER_H
  #endif /* HELPER_GEN_H */
 diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg-op-common.h
 +++ b/include/tcg/tcg-op-common.h
@@ -XXX,XX +XXX,XX @@
  #include "tcg/tcg.h"
  #include "exec/helper-proto.h"
 -#include "exec/helper-gen.h"
 +#include "exec/helper-gen-common.h"
  /* Basic output routines.  Not for general consumption.  */
 diff --git a/include/exec/helper-gen.h.inc b/include/exec/helper-gen.h.inc
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/exec/helper-gen.h.inc
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Helper file for declaring TCG helper functions.
 + * This one expands generation functions for tcg opcodes.
 + * Define HELPER_H for the header file to be expanded,
 + * and static inline to change from global file scope.
 + */
 +
 +#include "tcg/tcg.h"
 +#include "tcg/helper-info.h"
 +#include "exec/helper-head.h"
 +
 +#define DEF_HELPER_FLAGS_0(name, flags, ret)                            \
 +extern TCGHelperInfo glue(helper_info_, name);                          \
 +static inline void glue(gen_helper_, name)(dh_retvar_decl0(ret))        \
 +{                                                                       \
 +    tcg_gen_call0(&glue(helper_info_, name), dh_retvar(ret));           \
 +}
 +
-+#define DEF_HELPER_FLAGS_1(name, flags, ret, t1)                        \
++static inline uint64_t ti_const_val(TempOptInfo *ti)
-+extern TCGHelperInfo glue(helper_info_, name);                          \
++{
-+static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
++    return ti->val;
 +    dh_arg_decl(t1, 1))                                                 \
 +{                                                                       \
 +    tcg_gen_call1(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1));                                       \
 +}
 +
-+#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2)                    \
++static inline bool ti_is_const_val(TempOptInfo *ti, uint64_t val)
-+extern TCGHelperInfo glue(helper_info_, name);                          \
++{
-+static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
++    return ti_is_const(ti) && ti_const_val(ti) == val;
 +    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2))                             \
 +{                                                                       \
 +    tcg_gen_call2(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2));                        \
 +}
 +
-+#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3)                \
+ static inline bool ts_is_const(TCGTemp *ts)
-+extern TCGHelperInfo glue(helper_info_, name);                          \
+ {
-+static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+-    return ts_info(ts)->is_const;
-+    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3))         \
++    return ti_is_const(ts_info(ts));
-+{                                                                       \
+ }
-+    tcg_gen_call3(&glue(helper_info_, name), dh_retvar(ret),            \
-+                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3));         \
+ static inline bool ts_is_const_val(TCGTemp *ts, uint64_t val)
-+}
+ {
-+
+-    TempOptInfo *ti = ts_info(ts);
-+#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4)            \
+-    return ti->is_const && ti->val == val;
-+extern TCGHelperInfo glue(helper_info_, name);                          \
++    return ti_is_const_val(ts_info(ts), val);
-+static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
+ }
-+    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2),                             \
-+    dh_arg_decl(t3, 3), dh_arg_decl(t4, 4))                             \
+ static inline bool arg_is_const(TCGArg arg)
 +{                                                                       \
 +    tcg_gen_call4(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2),                         \
 +                  dh_arg(t3, 3), dh_arg(t4, 4));                        \
 +}
 +
 +#define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5)        \
 +extern TCGHelperInfo glue(helper_info_, name);                          \
 +static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 +    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
 +    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5))                             \
 +{                                                                       \
 +    tcg_gen_call5(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 +                  dh_arg(t4, 4), dh_arg(t5, 5));                        \
 +}
 +
 +#define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6)    \
 +extern TCGHelperInfo glue(helper_info_, name);                          \
 +static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 +    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
 +    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6))         \
 +{                                                                       \
 +    tcg_gen_call6(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 +                  dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6));         \
 +}
 +
 +#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
 +extern TCGHelperInfo glue(helper_info_, name);                          \
 +static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 +    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
 +    dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
 +    dh_arg_decl(t7, 7))                                                 \
 +{                                                                       \
 +    tcg_gen_call7(&glue(helper_info_, name), dh_retvar(ret),            \
 +                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
 +                  dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),          \
 +                  dh_arg(t7, 7));                                       \
 +}
 +
 +#include HELPER_H
 +
 +#undef DEF_HELPER_FLAGS_0
 +#undef DEF_HELPER_FLAGS_1
 +#undef DEF_HELPER_FLAGS_2
 +#undef DEF_HELPER_FLAGS_3
 +#undef DEF_HELPER_FLAGS_4
 +#undef DEF_HELPER_FLAGS_5
 +#undef DEF_HELPER_FLAGS_6
 +#undef DEF_HELPER_FLAGS_7
 diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate.c
 +++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "semihosting/semihost.h"
  #include "exec/log.h"
  #include "cpregs.h"
 +#include "translate.h"
 +#include "translate-a32.h"
 +#include "exec/gen-icount.h"
  #define HELPER_H "helper.h"
  #include "exec/helper-info.c.inc"
@@ -XXX,XX +XXX,XX @@
  #define ENABLE_ARCH_7     arm_dc_feature(s, ARM_FEATURE_V7)
  #define ENABLE_ARCH_8     arm_dc_feature(s, ARM_FEATURE_V8)
 -#include "translate.h"
 -#include "translate-a32.h"
 -
  /* These are TCG temporaries used only by the legacy iwMMXt decoder */
  static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
  /* These are TCG globals which alias CPUARMState fields */
@@ -XXX,XX +XXX,XX @@ TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
  TCGv_i64 cpu_exclusive_addr;
  TCGv_i64 cpu_exclusive_val;
 -#include "exec/gen-icount.h"
 -
  static const char * const regnames[] =
      { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
 --
-.34.1
+.43.0

-[PULL 33/52] tcg: Spit out exec/translation-block.h
+[PULL 11/72] tcg/optimize: Use fold_masks_zs in fold_and
-This is all that is required by tcg/ from exec-all.h.
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
 Sink mask computation below fold_affected_mask early exit.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h          | 132 +--------------------------
+ tcg/optimize.c | 30 ++++++++++++++++--------------
- include/exec/translation-block.h | 149 +++++++++++++++++++++++++++++++
+file changed, 16 insertions(+), 14 deletions(-)
  tcg/tcg-op-ldst.c                |   2 +-
 files changed, 151 insertions(+), 132 deletions(-)
  create mode 100644 include/exec/translation-block.h
-diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2(OptContext *ctx, TCGOp *op)
- #ifdef CONFIG_TCG
- #include "exec/cpu_ldst.h"
+ static bool fold_and(OptContext *ctx, TCGOp *op)
- #endif
+ {
--#include "qemu/interval-tree.h"
+-    uint64_t z1, z2;
-+#include "exec/translation-block.h"
++    uint64_t z1, z2, z_mask, s_mask;
- #include "qemu/clang-tsa.h"
++    TempOptInfo *t1, *t2;
--/* Page tracking code uses ram addresses in system mode, and virtual
+     if (fold_const2_commutative(ctx, op) ||
--   addresses in userspace mode.  Define tb_page_addr_t to be an appropriate
+         fold_xi_to_i(ctx, op, 0) ||
--   type.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
--#if defined(CONFIG_USER_ONLY)
+         return true;
--typedef vaddr tb_page_addr_t;
+     }
--#define TB_PAGE_ADDR_FMT "%" VADDR_PRIx
--#else
+-    z1 = arg_info(op->args[1])->z_mask;
--typedef ram_addr_t tb_page_addr_t;
+-    z2 = arg_info(op->args[2])->z_mask;
--#define TB_PAGE_ADDR_FMT RAM_ADDR_FMT
+-    ctx->z_mask = z1 & z2;
 -#endif
 -
  /**
   * cpu_unwind_state_data:
   * @cpu: the cpu context
@@ -XXX,XX +XXX,XX @@ int probe_access_full(CPUArchState *env, target_ulong addr, int size,
                        CPUTLBEntryFull **pfull, uintptr_t retaddr);
  #endif
 -#define CODE_GEN_ALIGN           16 /* must be >= of the size of a icache line */
 -
  /* Estimated block size for TB allocation.  */
  /* ??? The following is based on a 2015 survey of x86_64 host output.
     Better would seem to be some sort of dynamically sized TB array,
@@ -XXX,XX +XXX,XX @@ int probe_access_full(CPUArchState *env, target_ulong addr, int size,
  #define CODE_GEN_AVG_BLOCK_SIZE 150
  #endif
 -/*
 - * Translation Cache-related fields of a TB.
 - * This struct exists just for convenience; we keep track of TB's in a binary
 - * search tree, and the only fields needed to compare TB's in the tree are
 - * @ptr and @size.
 - * Note: the address of search data can be obtained by adding @size to @ptr.
 - */
 -struct tb_tc {
 -    const void *ptr;    /* pointer to the translated code */
 -    size_t size;
 -};
 -
 -struct TranslationBlock {
 -    /*
 -     * Guest PC corresponding to this block.  This must be the true
 -     * virtual address.  Therefore e.g. x86 stores EIP + CS_BASE, and
 -     * targets like Arm, MIPS, HP-PA, which reuse low bits for ISA or
 -     * privilege, must store those bits elsewhere.
 -     *
 -     * If CF_PCREL, the opcodes for the TranslationBlock are written
 -     * such that the TB is associated only with the physical page and
 -     * may be run in any virtual address context.  In this case, PC
 -     * must always be taken from ENV in a target-specific manner.
 -     * Unwind information is taken as offsets from the page, to be
 -     * deposited into the "current" PC.
 -     */
 -    vaddr pc;
 -
 -    /*
--     * Target-specific data associated with the TranslationBlock, e.g.:
+-     * Sign repetitions are perforce all identical, whether they are 1 or 0.
--     * x86: the original user, the Code Segment virtual base,
+-     * Bitwise operations preserve the relative quantity of the repetitions.
 -     * arm: an extension of tb->flags,
 -     * s390x: instruction data for EXECUTE,
 -     * sparc: the next pc of the instruction queue (for delay slots).
 -     */
--    uint64_t cs_base;
+-    ctx->s_mask = arg_info(op->args[1])->s_mask
--
+-                & arg_info(op->args[2])->s_mask;
--    uint32_t flags; /* flags defining in which context the code was generated */
++    t1 = arg_info(op->args[1]);
--    uint32_t cflags;    /* compile flags */
++    t2 = arg_info(op->args[2]);
--
++    z1 = t1->z_mask;
--/* Note that TCG_MAX_INSNS is 512; we validate this match elsewhere. */
++    z2 = t2->z_mask;
--#define CF_COUNT_MASK    0x000001ff
--#define CF_NO_GOTO_TB    0x00000200 /* Do not chain with goto_tb */
+     /*
--#define CF_NO_GOTO_PTR   0x00000400 /* Do not chain with goto_ptr */
+      * Known-zeros does not imply known-ones.  Therefore unless
--#define CF_SINGLE_STEP   0x00000800 /* gdbstub single-step in effect */
+      * arg2 is constant, we can't infer affected bits from it.
--#define CF_LAST_IO       0x00008000 /* Last insn may be an IO access.  */
+      */
--#define CF_MEMI_ONLY     0x00010000 /* Only instrument memory ops */
+-    if (arg_is_const(op->args[2]) &&
--#define CF_USE_ICOUNT    0x00020000
+-        fold_affected_mask(ctx, op, z1 & ~z2)) {
--#define CF_INVALID       0x00040000 /* TB is stale. Set with @jmp_lock held */
++    if (ti_is_const(t2) && fold_affected_mask(ctx, op, z1 & ~z2)) {
--#define CF_PARALLEL      0x00080000 /* Generate code for a parallel context */
+         return true;
--#define CF_NOIRQ         0x00100000 /* Generate an uninterruptible TB */
+     }
--#define CF_PCREL         0x00200000 /* Opcodes in TB are PC-relative */
--#define CF_CLUSTER_MASK  0xff000000 /* Top 8 bits are cluster ID */
+-    return fold_masks(ctx, op);
--#define CF_CLUSTER_SHIFT 24
++    z_mask = z1 & z2;
 -
 -    /*
 -     * Above fields used for comparing
 -     */
 -
 -    /* size of target code for this block (1 <= size <= TARGET_PAGE_SIZE) */
 -    uint16_t size;
 -    uint16_t icount;
 -
 -    struct tb_tc tc;
 -
 -    /*
 -     * Track tb_page_addr_t intervals that intersect this TB.
 -     * For user-only, the virtual addresses are always contiguous,
 -     * and we use a unified interval tree.  For system, we use a
 -     * linked list headed in each PageDesc.  Within the list, the lsb
 -     * of the previous pointer tells the index of page_next[], and the
 -     * list is protected by the PageDesc lock(s).
 -     */
 -#ifdef CONFIG_USER_ONLY
 -    IntervalTreeNode itree;
 -#else
 -    uintptr_t page_next[2];
 -    tb_page_addr_t page_addr[2];
 -#endif
 -
 -    /* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
 -    QemuSpin jmp_lock;
 -
 -    /* The following data are used to directly call another TB from
 -     * the code of this one. This can be done either by emitting direct or
 -     * indirect native jump instructions. These jumps are reset so that the TB
 -     * just continues its execution. The TB can be linked to another one by
 -     * setting one of the jump targets (or patching the jump instruction). Only
 -     * two of such jumps are supported.
 -     */
 -#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
 -    uint16_t jmp_reset_offset[2]; /* offset of original jump target */
 -    uint16_t jmp_insn_offset[2];  /* offset of direct jump insn */
 -    uintptr_t jmp_target_addr[2]; /* target address */
 -
 -    /*
 -     * Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
 -     * Each TB can have two outgoing jumps, and therefore can participate
 -     * in two lists. The list entries are kept in jmp_list_next[2]. The least
 -     * significant bit (LSB) of the pointers in these lists is used to encode
 -     * which of the two list entries is to be used in the pointed TB.
 -     *
 -     * List traversals are protected by jmp_lock. The destination TB of each
 -     * outgoing jump is kept in jmp_dest[] so that the appropriate jmp_lock
 -     * can be acquired from any origin TB.
 -     *
 -     * jmp_dest[] are tagged pointers as well. The LSB is set when the TB is
 -     * being invalidated, so that no further outgoing jumps from it can be set.
 -     *
 -     * jmp_lock also protects the CF_INVALID cflag; a jump must not be chained
 -     * to a destination TB that has CF_INVALID set.
 -     */
 -    uintptr_t jmp_list_head;
 -    uintptr_t jmp_list_next[2];
 -    uintptr_t jmp_dest[2];
 -};
 -
  /* Hide the qatomic_read to make code a little easier on the eyes */
  static inline uint32_t tb_cflags(const TranslationBlock *tb)
  {
 diff --git a/include/exec/translation-block.h b/include/exec/translation-block.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/exec/translation-block.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: LGPL-2.1-or-later */
 +/*
 + * Definition of TranslationBlock.
 + *  Copyright (c) 2003 Fabrice Bellard
 + */
 +
 +#ifndef EXEC_TRANSLATION_BLOCK_H
 +#define EXEC_TRANSLATION_BLOCK_H
 +
 +#include "qemu/atomic.h"
 +#include "qemu/thread.h"
 +#include "qemu/interval-tree.h"
 +#include "exec/cpu-common.h"
 +#include "exec/target_page.h"
 +
 +/*
 + * Page tracking code uses ram addresses in system mode, and virtual
 + * addresses in userspace mode.  Define tb_page_addr_t to be an
 + * appropriate type.
 + */
 +#if defined(CONFIG_USER_ONLY)
 +typedef vaddr tb_page_addr_t;
 +#define TB_PAGE_ADDR_FMT "%" VADDR_PRIx
 +#else
 +typedef ram_addr_t tb_page_addr_t;
 +#define TB_PAGE_ADDR_FMT RAM_ADDR_FMT
 +#endif
 +
 +/*
 + * Translation Cache-related fields of a TB.
 + * This struct exists just for convenience; we keep track of TB's in a binary
 + * search tree, and the only fields needed to compare TB's in the tree are
 + * @ptr and @size.
 + * Note: the address of search data can be obtained by adding @size to @ptr.
 + */
 +struct tb_tc {
 +    const void *ptr;    /* pointer to the translated code */
 +    size_t size;
 +};
 +
 +struct TranslationBlock {
 +    /*
 +     * Guest PC corresponding to this block.  This must be the true
 +     * virtual address.  Therefore e.g. x86 stores EIP + CS_BASE, and
 +     * targets like Arm, MIPS, HP-PA, which reuse low bits for ISA or
 +     * privilege, must store those bits elsewhere.
 +     *
 +     * If CF_PCREL, the opcodes for the TranslationBlock are written
 +     * such that the TB is associated only with the physical page and
 +     * may be run in any virtual address context.  In this case, PC
 +     * must always be taken from ENV in a target-specific manner.
 +     * Unwind information is taken as offsets from the page, to be
 +     * deposited into the "current" PC.
 +     */
 +    vaddr pc;
 +
 +    /*
-+     * Target-specific data associated with the TranslationBlock, e.g.:
++     * Sign repetitions are perforce all identical, whether they are 1 or 0.
-+     * x86: the original user, the Code Segment virtual base,
++     * Bitwise operations preserve the relative quantity of the repetitions.
 +     * arm: an extension of tb->flags,
 +     * s390x: instruction data for EXECUTE,
 +     * sparc: the next pc of the instruction queue (for delay slots).
 +     */
-+    uint64_t cs_base;
++    s_mask = t1->s_mask & t2->s_mask;
 +
-+    uint32_t flags; /* flags defining in which context the code was generated */
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
-+    uint32_t cflags;    /* compile flags */
+ }
-+
-+/* Note that TCG_MAX_INSNS is 512; we validate this match elsewhere. */
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
 +#define CF_COUNT_MASK    0x000001ff
 +#define CF_NO_GOTO_TB    0x00000200 /* Do not chain with goto_tb */
 +#define CF_NO_GOTO_PTR   0x00000400 /* Do not chain with goto_ptr */
 +#define CF_SINGLE_STEP   0x00000800 /* gdbstub single-step in effect */
 +#define CF_LAST_IO       0x00008000 /* Last insn may be an IO access.  */
 +#define CF_MEMI_ONLY     0x00010000 /* Only instrument memory ops */
 +#define CF_USE_ICOUNT    0x00020000
 +#define CF_INVALID       0x00040000 /* TB is stale. Set with @jmp_lock held */
 +#define CF_PARALLEL      0x00080000 /* Generate code for a parallel context */
 +#define CF_NOIRQ         0x00100000 /* Generate an uninterruptible TB */
 +#define CF_PCREL         0x00200000 /* Opcodes in TB are PC-relative */
 +#define CF_CLUSTER_MASK  0xff000000 /* Top 8 bits are cluster ID */
 +#define CF_CLUSTER_SHIFT 24
 +
 +    /*
 +     * Above fields used for comparing
 +     */
 +
 +    /* size of target code for this block (1 <= size <= TARGET_PAGE_SIZE) */
 +    uint16_t size;
 +    uint16_t icount;
 +
 +    struct tb_tc tc;
 +
 +    /*
 +     * Track tb_page_addr_t intervals that intersect this TB.
 +     * For user-only, the virtual addresses are always contiguous,
 +     * and we use a unified interval tree.  For system, we use a
 +     * linked list headed in each PageDesc.  Within the list, the lsb
 +     * of the previous pointer tells the index of page_next[], and the
 +     * list is protected by the PageDesc lock(s).
 +     */
 +#ifdef CONFIG_USER_ONLY
 +    IntervalTreeNode itree;
 +#else
 +    uintptr_t page_next[2];
 +    tb_page_addr_t page_addr[2];
 +#endif
 +
 +    /* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
 +    QemuSpin jmp_lock;
 +
 +    /* The following data are used to directly call another TB from
 +     * the code of this one. This can be done either by emitting direct or
 +     * indirect native jump instructions. These jumps are reset so that the TB
 +     * just continues its execution. The TB can be linked to another one by
 +     * setting one of the jump targets (or patching the jump instruction). Only
 +     * two of such jumps are supported.
 +     */
 +#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
 +    uint16_t jmp_reset_offset[2]; /* offset of original jump target */
 +    uint16_t jmp_insn_offset[2];  /* offset of direct jump insn */
 +    uintptr_t jmp_target_addr[2]; /* target address */
 +
 +    /*
 +     * Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
 +     * Each TB can have two outgoing jumps, and therefore can participate
 +     * in two lists. The list entries are kept in jmp_list_next[2]. The least
 +     * significant bit (LSB) of the pointers in these lists is used to encode
 +     * which of the two list entries is to be used in the pointed TB.
 +     *
 +     * List traversals are protected by jmp_lock. The destination TB of each
 +     * outgoing jump is kept in jmp_dest[] so that the appropriate jmp_lock
 +     * can be acquired from any origin TB.
 +     *
 +     * jmp_dest[] are tagged pointers as well. The LSB is set when the TB is
 +     * being invalidated, so that no further outgoing jumps from it can be set.
 +     *
 +     * jmp_lock also protects the CF_INVALID cflag; a jump must not be chained
 +     * to a destination TB that has CF_INVALID set.
 +     */
 +    uintptr_t jmp_list_head;
 +    uintptr_t jmp_list_next[2];
 +    uintptr_t jmp_dest[2];
 +};
 +
 +/* The alignment given to TranslationBlock during allocation. */
 +#define CODE_GEN_ALIGN  16
 +
 +#endif /* EXEC_TRANSLATION_BLOCK_H */
 diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op-ldst.c
 +++ b/tcg/tcg-op-ldst.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "exec/exec-all.h"
  #include "tcg/tcg.h"
  #include "tcg/tcg-temp-internal.h"
  #include "tcg/tcg-op-common.h"
  #include "tcg/tcg-mo.h"
 +#include "exec/translation-block.h"
  #include "exec/plugin-gen.h"
  #include "tcg-internal.h"
 --
-.34.1
+.43.0

-New patch
+[PULL 12/72] tcg/optimize: Use fold_masks_zs in fold_andc
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
+Avoid double inversion of the value of second const operand.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 21 +++++++++++----------
+file changed, 11 insertions(+), 10 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
+ {
+-    uint64_t z1;
++    uint64_t z_mask, s_mask;
++    TempOptInfo *t1, *t2;
+     if (fold_const2(ctx, op) ||
+         fold_xx_to_i(ctx, op, 0) ||
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+         return true;
+     }
+-    z1 = arg_info(op->args[1])->z_mask;
++    t1 = arg_info(op->args[1]);
++    t2 = arg_info(op->args[2]);
++    z_mask = t1->z_mask;
+     /*
+      * Known-zeros does not imply known-ones.  Therefore unless
+      * arg2 is constant, we can't infer anything from it.
+      */
+-    if (arg_is_const(op->args[2])) {
+-        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
+-        if (fold_affected_mask(ctx, op, z1 & ~z2)) {
++    if (ti_is_const(t2)) {
++        uint64_t v2 = ti_const_val(t2);
++        if (fold_affected_mask(ctx, op, z_mask & v2)) {
+             return true;
+         }
+-        z1 &= z2;
++        z_mask &= ~v2;
+     }
+-    ctx->z_mask = z1;
+-    ctx->s_mask = arg_info(op->args[1])->s_mask
+-                & arg_info(op->args[2])->s_mask;
+-    return fold_masks(ctx, op);
++    s_mask = t1->s_mask & t2->s_mask;
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 13/72] tcg/optimize: Use fold_masks_zs in fold_bswap
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
+Always set s_mask along the BSWAP_OS path, since the result is
+being explicitly sign-extended.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 21 ++++++++++-----------
+file changed, 10 insertions(+), 11 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+ static bool fold_bswap(OptContext *ctx, TCGOp *op)
+ {
+     uint64_t z_mask, s_mask, sign;
++    TempOptInfo *t1 = arg_info(op->args[1]);
+-    if (arg_is_const(op->args[1])) {
+-        uint64_t t = arg_info(op->args[1])->val;
+-
+-        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
+-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    if (ti_is_const(t1)) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0],
++                                do_constant_folding(op->opc, ctx->type,
++                                                    ti_const_val(t1),
++                                                    op->args[2]));
+     }
+-    z_mask = arg_info(op->args[1])->z_mask;
+-
++    z_mask = t1->z_mask;
+     switch (op->opc) {
+     case INDEX_op_bswap16_i32:
+     case INDEX_op_bswap16_i64:
+@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
+         /* If the sign bit may be 1, force all the bits above to 1. */
+         if (z_mask & sign) {
+             z_mask |= sign;
+-            s_mask = sign << 1;
+         }
++        /* The value and therefore s_mask is explicitly sign-extended. */
++        s_mask = sign;
+         break;
+     default:
+         /* The high bits are undefined: force all bits above the sign to 1. */
+         z_mask |= sign << 1;
+         break;
+     }
+-    ctx->z_mask = z_mask;
+-    ctx->s_mask = s_mask;
+-    return fold_masks(ctx, op);
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 14/72] tcg/optimize: Use fold_masks_zs in fold_count_zeros
+Avoid the use of the OptContext slots. Find TempOptInfo once.
+Compute s_mask from the union of the maximum count and the
+op2 fallback for op1 being zero.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 15 ++++++++++-----
+file changed, 10 insertions(+), 5 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
+ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
+ {
+-    uint64_t z_mask;
++    uint64_t z_mask, s_mask;
++    TempOptInfo *t1 = arg_info(op->args[1]);
++    TempOptInfo *t2 = arg_info(op->args[2]);
+-    if (arg_is_const(op->args[1])) {
+-        uint64_t t = arg_info(op->args[1])->val;
++    if (ti_is_const(t1)) {
++        uint64_t t = ti_const_val(t1);
+         if (t != 0) {
+             t = do_constant_folding(op->opc, ctx->type, t, 0);
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
+     default:
+         g_assert_not_reached();
+     }
+-    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
+-    return false;
++    s_mask = ~z_mask;
++    z_mask |= t2->z_mask;
++    s_mask &= t2->s_mask;
++
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
+ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 15/72] tcg/optimize: Use fold_masks_z in fold_ctpop
+Add fold_masks_z as a trivial wrapper around fold_masks_zs.
+Avoid the use of the OptContext slots.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 13 ++++++++++---
+file changed, 10 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
+     return true;
+ }
++static bool fold_masks_z(OptContext *ctx, TCGOp *op, uint64_t z_mask)
++{
++    return fold_masks_zs(ctx, op, z_mask, 0);
++}
++
+ static bool fold_masks(OptContext *ctx, TCGOp *op)
+ {
+     return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
+ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+ {
++    uint64_t z_mask;
++
+     if (fold_const1(ctx, op)) {
+         return true;
+     }
+     switch (ctx->type) {
+     case TCG_TYPE_I32:
+-        ctx->z_mask = 32 | 31;
++        z_mask = 32 | 31;
+         break;
+     case TCG_TYPE_I64:
+-        ctx->z_mask = 64 | 63;
++        z_mask = 64 | 63;
+         break;
+     default:
+         g_assert_not_reached();
+     }
+-    return false;
++    return fold_masks_z(ctx, op, z_mask);
+ }
+ static bool fold_deposit(OptContext *ctx, TCGOp *op)
+--
+.43.0

-[PULL 01/52] tcg/ppc: Remove TARGET_LONG_BITS, TCG_TYPE_TL
+[PULL 16/72] tcg/optimize: Use fold_and and fold_masks_z in fold_deposit
-All uses replaced with TCGContext.addr_type.
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
 When we fold to and, use fold_and.
-Reviewed-by: Anton Johansson <anjo@rev.ng>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/ppc/tcg-target.c.inc | 21 +++++++++++----------
+ tcg/optimize.c | 35 +++++++++++++++++------------------
-file changed, 11 insertions(+), 10 deletions(-)
+file changed, 17 insertions(+), 18 deletions(-)
-diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/ppc/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/ppc/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-                                            TCGReg addrlo, TCGReg addrhi,
-                                            MemOpIdx oi, bool is_ld)
+ static bool fold_deposit(OptContext *ctx, TCGOp *op)
  {
-+    TCGType addr_type = s->addr_type;
++    TempOptInfo *t1 = arg_info(op->args[1]);
-     TCGLabelQemuLdst *ldst = NULL;
++    TempOptInfo *t2 = arg_info(op->args[2]);
-     MemOp opc = get_memop(oi);
++    int ofs = op->args[3];
-     MemOp a_bits, s_bits;
++    int len = op->args[4];
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+     TCGOpcode and_opc;
-     tcg_out32(s, AND | SAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_R0));
++    uint64_t z_mask;
-     /* Load the (low part) TLB comparator into TMP2.  */
+-    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
--    if (cmp_off == 0 && TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
+-        uint64_t t1 = arg_info(op->args[1])->val;
--        uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32
+-        uint64_t t2 = arg_info(op->args[2])->val;
-+    if (cmp_off == 0
+-
-+        && (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32)) {
+-        t1 = deposit64(t1, op->args[3], op->args[4], t2);
-+        uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32
+-        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
-                         ? LWZUX : LDUX);
++    if (ti_is_const(t1) && ti_is_const(t2)) {
-         tcg_out32(s, lxu | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
++        return tcg_opt_gen_movi(ctx, op, op->args[0],
-     } else {
++                                deposit64(ti_const_val(t1), ofs, len,
-         tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2));
++                                          ti_const_val(t2)));
 -        if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
 +        if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) {
              tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2,
                         TCG_REG_TMP1, cmp_off + 4 * HOST_BIG_ENDIAN);
          } else {
 -            tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
 +            tcg_out_ld(s, addr_type, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
          }
      }
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+     switch (ctx->type) {
-      * Load the TLB addend for use on the fast path.
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
       * Do this asap to minimize any load use delay.
       */
 -    if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
 +    if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) {
          tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1,
                     offsetof(CPUTLBEntry, addend));
      }
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
-         }
+     /* Inserting a value into zero at offset 0. */
+-    if (arg_is_const_val(op->args[1], 0) && op->args[3] == 0) {
-         /* Mask the address for the requested alignment.  */
+-        uint64_t mask = MAKE_64BIT_MASK(0, op->args[4]);
--        if (TARGET_LONG_BITS == 32) {
++    if (ti_is_const_val(t1, 0) && ofs == 0) {
-+        if (addr_type == TCG_TYPE_I32) {
++        uint64_t mask = MAKE_64BIT_MASK(0, len);
-             tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0,
-                         (32 - a_bits) & 31, 31 - s->page_bits);
+         op->opc = and_opc;
-         } else if (a_bits == 0) {
+         op->args[1] = op->args[2];
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+         op->args[2] = arg_new_constant(ctx, mask);
-         }
+-        ctx->z_mask = mask & arg_info(op->args[1])->z_mask;
 -        return false;
 +        return fold_and(ctx, op);
      }
--    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
+     /* Inserting zero into a value. */
-+    if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) {
+-    if (arg_is_const_val(op->args[2], 0)) {
-         /* Low part comparison into cr7. */
+-        uint64_t mask = deposit64(-1, op->args[3], op->args[4], 0);
-         tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2,
++    if (ti_is_const_val(t2, 0)) {
-, 7, TCG_TYPE_I32);
++        uint64_t mask = deposit64(-1, ofs, len, 0);
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
-         tcg_out32(s, CRAND | BT(7, CR_EQ) | BA(6, CR_EQ) | BB(7, CR_EQ));
+         op->opc = and_opc;
-     } else {
+         op->args[2] = arg_new_constant(ctx, mask);
-         /* Full comparison into cr7. */
+-        ctx->z_mask = mask & arg_info(op->args[1])->z_mask;
--        tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2,
+-        return false;
--                    0, 7, TCG_TYPE_TL);
++        return fold_and(ctx, op);
 +        tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2, 0, 7, addr_type);
      }
-     /* Load a pointer into the current opcode w/conditional branch-link. */
+-    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+-                            op->args[3], op->args[4],
-     h->base = guest_base ? TCG_GUEST_BASE_REG : 0;
+-                            arg_info(op->args[2])->z_mask);
- #endif
+-    return false;
++    z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask);
--    if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
++    return fold_masks_z(ctx, op, z_mask);
-+    if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) {
+ }
-         /* Zero-extend the guest address for use in the host address. */
-         tcg_out_ext32u(s, TCG_REG_R0, addrlo);
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
          h->index = TCG_REG_R0;
 --
-.34.1
+.43.0

-New patch
+[PULL 17/72] tcg/optimize: Compute sign mask in fold_deposit
+The input which overlaps the sign bit of the output can
+have its input s_mask propagated to the output s_mask.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 14 ++++++++++++--
+file changed, 12 insertions(+), 2 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
+     TempOptInfo *t2 = arg_info(op->args[2]);
+     int ofs = op->args[3];
+     int len = op->args[4];
++    int width;
+     TCGOpcode and_opc;
+-    uint64_t z_mask;
++    uint64_t z_mask, s_mask;
+     if (ti_is_const(t1) && ti_is_const(t2)) {
+         return tcg_opt_gen_movi(ctx, op, op->args[0],
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
+     switch (ctx->type) {
+     case TCG_TYPE_I32:
+         and_opc = INDEX_op_and_i32;
++        width = 32;
+         break;
+     case TCG_TYPE_I64:
+         and_opc = INDEX_op_and_i64;
++        width = 64;
+         break;
+     default:
+         g_assert_not_reached();
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
+         return fold_and(ctx, op);
+     }
++    /* The s_mask from the top portion of the deposit is still valid. */
++    if (ofs + len == width) {
++        s_mask = t2->s_mask << ofs;
++    } else {
++        s_mask = t1->s_mask & ~MAKE_64BIT_MASK(0, ofs + len);
++    }
++
+     z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask);
+-    return fold_masks_z(ctx, op, z_mask);
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 18/72] tcg/optimize: Use finish_folding in fold_divide
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
+         fold_xi_to_x(ctx, op, 1)) {
+         return true;
+     }
+-    return false;
++    return finish_folding(ctx, op);
+ }
+ static bool fold_dup(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 19/72] tcg/optimize: Use finish_folding in fold_dup, fold_dup2
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 4 ++--
+file changed, 2 insertions(+), 2 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup(OptContext *ctx, TCGOp *op)
+         t = dup_const(TCGOP_VECE(op), t);
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+     }
+-    return false;
++    return finish_folding(ctx, op);
+ }
+ static bool fold_dup2(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
+         op->opc = INDEX_op_dup_vec;
+         TCGOP_VECE(op) = MO_32;
+     }
+-    return false;
++    return finish_folding(ctx, op);
+ }
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+--
+.43.0

-[PULL 28/52] tcg: Move TLB_FLAGS_MASK check out of get_alignment_bits
+[PULL 20/72] tcg/optimize: Use fold_masks_s in fold_eqv
-The replacement isn't ideal, as the raw count of bits
+Add fold_masks_s as a trivial wrapper around fold_masks_zs.
-is not easily synced with exec/cpu-all.h, but it does
+Avoid the use of the OptContext slots.
 remove from tcg.h the target dependency on TARGET_PAGE_BITS_MIN
 which is built into TLB_FLAGS_MASK.
-Reviewed-by: Anton Johansson <anjo@rev.ng>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-all.h |  3 +++
+ tcg/optimize.c | 13 ++++++++++---
- include/tcg/tcg.h      |  4 ----
+file changed, 10 insertions(+), 3 deletions(-)
  tcg/tcg-op-ldst.c      | 18 ++++++++++++++++--
 files changed, 19 insertions(+), 6 deletions(-)
-diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks_z(OptContext *ctx, TCGOp *op, uint64_t z_mask)
-  *
+     return fold_masks_zs(ctx, op, z_mask, 0);
   * Use TARGET_PAGE_BITS_MIN so that these bits are constant
   * when TARGET_PAGE_BITS_VARY is in effect.
 + *
 + * The count, if not the placement of these bits is known
 + * to tcg/tcg-op-ldst.c, check_max_alignment().
   */
  /* Zero if TLB entry is valid.  */
  #define TLB_INVALID_MASK    (1 << (TARGET_PAGE_BITS_MIN - 1))
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg.h
 +++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ static inline unsigned get_alignment_bits(MemOp memop)
          /* A specific alignment requirement.  */
          a = a >> MO_ASHIFT;
      }
 -#if defined(CONFIG_SOFTMMU)
 -    /* The requested alignment cannot overlap the TLB flags.  */
 -    tcg_debug_assert((TLB_FLAGS_MASK & ((1 << a) - 1)) == 0);
 -#endif
      return a;
  }
-diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
++static bool fold_masks_s(OptContext *ctx, TCGOp *op, uint64_t s_mask)
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op-ldst.c
 +++ b/tcg/tcg-op-ldst.c
@@ -XXX,XX +XXX,XX @@
  #include "tcg-internal.h"
 -static inline MemOp tcg_canonicalize_memop(MemOp op, bool is64, bool st)
 +static void check_max_alignment(unsigned a_bits)
 +{
-+#if defined(CONFIG_SOFTMMU)
++    return fold_masks_zs(ctx, op, -1, s_mask);
 +    /*
 +     * The requested alignment cannot overlap the TLB flags.
 +     * FIXME: Must keep the count up-to-date with "exec/cpu-all.h".
 +     */
 +    tcg_debug_assert(a_bits + 6 <= tcg_ctx->page_bits);
 +#endif
 +}
 +
-+static MemOp tcg_canonicalize_memop(MemOp op, bool is64, bool st)
+ static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
--    /* Trigger the asserts within as early as possible.  */
+     return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
-     unsigned a_bits = get_alignment_bits(op);
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
-+    check_max_alignment(a_bits);
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t s_mask;
 +
-     /* Prefer MO_ALIGN+MO_XX over MO_ALIGN_XX+MO_XX */
+     if (fold_const2_commutative(ctx, op) ||
-     if (a_bits == (op & MO_SIZE)) {
+         fold_xi_to_x(ctx, op, -1) ||
-         op = (op & ~MO_AMASK) | MO_ALIGN;
+         fold_xi_to_not(ctx, op, 0)) {
-@@ -XXX,XX +XXX,XX @@ static void tcg_gen_qemu_ld_i128_int(TCGv_i128 val, TCGTemp *addr,
+         return true;
-     TCGv_i64 ext_addr = NULL;
+     }
-     TCGOpcode opc;
+-    ctx->s_mask = arg_info(op->args[1])->s_mask
-+    check_max_alignment(get_alignment_bits(memop));
+-                & arg_info(op->args[2])->s_mask;
-     tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
+-    return false;
++    s_mask = arg_info(op->args[1])->s_mask
-     /* TODO: For now, force 32-bit hosts to use the helper. */
++           & arg_info(op->args[2])->s_mask;
-@@ -XXX,XX +XXX,XX @@ static void tcg_gen_qemu_st_i128_int(TCGv_i128 val, TCGTemp *addr,
++    return fold_masks_s(ctx, op, s_mask);
-     TCGv_i64 ext_addr = NULL;
+ }
-     TCGOpcode opc;
+ static bool fold_extract(OptContext *ctx, TCGOp *op)
 +    check_max_alignment(get_alignment_bits(memop));
      tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST);
      /* TODO: For now, force 32-bit hosts to use the helper. */
 --
-.34.1
+.43.0

-New patch
+[PULL 21/72] tcg/optimize: Use fold_masks_z in fold_extract
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 15 ++++++---------
+file changed, 6 insertions(+), 9 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+ static bool fold_extract(OptContext *ctx, TCGOp *op)
+ {
+     uint64_t z_mask_old, z_mask;
++    TempOptInfo *t1 = arg_info(op->args[1]);
+     int pos = op->args[2];
+     int len = op->args[3];
+-    if (arg_is_const(op->args[1])) {
+-        uint64_t t;
+-
+-        t = arg_info(op->args[1])->val;
+-        t = extract64(t, pos, len);
+-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    if (ti_is_const(t1)) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0],
++                                extract64(ti_const_val(t1), pos, len));
+     }
+-    z_mask_old = arg_info(op->args[1])->z_mask;
++    z_mask_old = t1->z_mask;
+     z_mask = extract64(z_mask_old, pos, len);
+     if (pos == 0 && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
+         return true;
+     }
+-    ctx->z_mask = z_mask;
+-    return fold_masks(ctx, op);
++    return fold_masks_z(ctx, op, z_mask);
+ }
+ static bool fold_extract2(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 22/72] tcg/optimize: Use finish_folding in fold_extract2
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 2 +-
+file changed, 1 insertion(+), 1 deletion(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
+         }
+         return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
+     }
+-    return false;
++    return finish_folding(ctx, op);
+ }
+ static bool fold_exts(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 23/72] tcg/optimize: Use fold_masks_zs in fold_exts
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
+Explicitly sign-extend z_mask instead of doing that manually.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 29 ++++++++++++-----------------
+file changed, 12 insertions(+), 17 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
+ static bool fold_exts(OptContext *ctx, TCGOp *op)
+ {
+-    uint64_t s_mask_old, s_mask, z_mask, sign;
++    uint64_t s_mask_old, s_mask, z_mask;
+     bool type_change = false;
++    TempOptInfo *t1;
+     if (fold_const1(ctx, op)) {
+         return true;
+     }
+-    z_mask = arg_info(op->args[1])->z_mask;
+-    s_mask = arg_info(op->args[1])->s_mask;
++    t1 = arg_info(op->args[1]);
++    z_mask = t1->z_mask;
++    s_mask = t1->s_mask;
+     s_mask_old = s_mask;
+     switch (op->opc) {
+     CASE_OP_32_64(ext8s):
+-        sign = INT8_MIN;
+-        z_mask = (uint8_t)z_mask;
++        s_mask |= INT8_MIN;
++        z_mask = (int8_t)z_mask;
+         break;
+     CASE_OP_32_64(ext16s):
+-        sign = INT16_MIN;
+-        z_mask = (uint16_t)z_mask;
++        s_mask |= INT16_MIN;
++        z_mask = (int16_t)z_mask;
+         break;
+     case INDEX_op_ext_i32_i64:
+         type_change = true;
+         QEMU_FALLTHROUGH;
+     case INDEX_op_ext32s_i64:
+-        sign = INT32_MIN;
+-        z_mask = (uint32_t)z_mask;
++        s_mask |= INT32_MIN;
++        z_mask = (int32_t)z_mask;
+         break;
+     default:
+         g_assert_not_reached();
+     }
+-    if (z_mask & sign) {
+-        z_mask |= sign;
+-    }
+-    s_mask |= sign << 1;
+-
+-    ctx->z_mask = z_mask;
+-    ctx->s_mask = s_mask;
+     if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+         return true;
+     }
+-    return fold_masks(ctx, op);
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
+ static bool fold_extu(OptContext *ctx, TCGOp *op)
+--
+.43.0

-[PULL 31/52] exec-all: Widen tb_page_addr_t for user-only
+[PULL 24/72] tcg/optimize: Use fold_masks_z in fold_extu
-This is a step toward making TranslationBlock agnostic
+Avoid the use of the OptContext slots.
 to the address size of the guest.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h | 4 ++--
+ tcg/optimize.c | 4 ++--
 file changed, 2 insertions(+), 2 deletions(-)
-diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
-    addresses in userspace mode.  Define tb_page_addr_t to be an appropriate
+         g_assert_not_reached();
-    type.  */
+     }
- #if defined(CONFIG_USER_ONLY)
--typedef abi_ulong tb_page_addr_t;
+-    ctx->z_mask = z_mask;
--#define TB_PAGE_ADDR_FMT TARGET_ABI_FMT_lx
+     if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
-+typedef vaddr tb_page_addr_t;
+         return true;
-+#define TB_PAGE_ADDR_FMT "%" VADDR_PRIx
+     }
- #else
+-    return fold_masks(ctx, op);
- typedef ram_addr_t tb_page_addr_t;
++
- #define TB_PAGE_ADDR_FMT RAM_ADDR_FMT
++    return fold_masks_z(ctx, op, z_mask);
  }
  static bool fold_mb(OptContext *ctx, TCGOp *op)
 --
-.34.1
+.43.0

-New patch
+[PULL 25/72] tcg/optimize: Use fold_masks_zs in fold_movcond
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 19 +++++++++++--------
+file changed, 11 insertions(+), 8 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
+ static bool fold_movcond(OptContext *ctx, TCGOp *op)
+ {
++    uint64_t z_mask, s_mask;
++    TempOptInfo *tt, *ft;
+     int i;
+     /* If true and false values are the same, eliminate the cmp. */
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
+         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
+     }
+-    ctx->z_mask = arg_info(op->args[3])->z_mask
+-                | arg_info(op->args[4])->z_mask;
+-    ctx->s_mask = arg_info(op->args[3])->s_mask
+-                & arg_info(op->args[4])->s_mask;
++    tt = arg_info(op->args[3]);
++    ft = arg_info(op->args[4]);
++    z_mask = tt->z_mask | ft->z_mask;
++    s_mask = tt->s_mask & ft->s_mask;
+-    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+-        uint64_t tv = arg_info(op->args[3])->val;
+-        uint64_t fv = arg_info(op->args[4])->val;
++    if (ti_is_const(tt) && ti_is_const(ft)) {
++        uint64_t tv = ti_const_val(tt);
++        uint64_t fv = ti_const_val(ft);
+         TCGOpcode opc, negopc = 0;
+         TCGCond cond = op->args[5];
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
+             }
+         }
+     }
+-    return false;
++
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 26/72] tcg/optimize: Use finish_folding in fold_mul*
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 6 +++---
+file changed, 3 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
+         fold_xi_to_x(ctx, op, 1)) {
+         return true;
+     }
+-    return false;
++    return finish_folding(ctx, op);
+ }
+ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+         fold_xi_to_i(ctx, op, 0)) {
+         return true;
+     }
+-    return false;
++    return finish_folding(ctx, op);
+ }
+ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
+         tcg_opt_gen_movi(ctx, op2, rh, h);
+         return true;
+     }
+-    return false;
++    return finish_folding(ctx, op);
+ }
+ static bool fold_nand(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 27/72] tcg/optimize: Use fold_masks_s in fold_nand
+Avoid the use of the OptContext slots.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 8 +++++---
+file changed, 5 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
+ static bool fold_nand(OptContext *ctx, TCGOp *op)
+ {
++    uint64_t s_mask;
++
+     if (fold_const2_commutative(ctx, op) ||
+         fold_xi_to_not(ctx, op, -1)) {
+         return true;
+     }
+-    ctx->s_mask = arg_info(op->args[1])->s_mask
+-                & arg_info(op->args[2])->s_mask;
+-    return false;
++    s_mask = arg_info(op->args[1])->s_mask
++           & arg_info(op->args[2])->s_mask;
++    return fold_masks_s(ctx, op, s_mask);
+ }
+ static bool fold_neg_no_const(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 28/72] tcg/optimize: Use fold_masks_z in fold_neg_no_const
+Avoid the use of the OptContext slots.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 9 ++-------
+file changed, 2 insertions(+), 7 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_neg_no_const(OptContext *ctx, TCGOp *op)
+ {
+     /* Set to 1 all bits to the left of the rightmost.  */
+     uint64_t z_mask = arg_info(op->args[1])->z_mask;
+-    ctx->z_mask = -(z_mask & -z_mask);
++    z_mask = -(z_mask & -z_mask);
+-    /*
+-     * Because of fold_sub_to_neg, we want to always return true,
+-     * via finish_folding.
+-     */
+-    finish_folding(ctx, op);
+-    return true;
++    return fold_masks_z(ctx, op, z_mask);
+ }
+ static bool fold_neg(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 29/72] tcg/optimize: Use fold_masks_s in fold_nor
+Avoid the use of the OptContext slots.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 8 +++++---
+file changed, 5 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
+ static bool fold_nor(OptContext *ctx, TCGOp *op)
+ {
++    uint64_t s_mask;
++
+     if (fold_const2_commutative(ctx, op) ||
+         fold_xi_to_not(ctx, op, 0)) {
+         return true;
+     }
+-    ctx->s_mask = arg_info(op->args[1])->s_mask
+-                & arg_info(op->args[2])->s_mask;
+-    return false;
++    s_mask = arg_info(op->args[1])->s_mask
++           & arg_info(op->args[2])->s_mask;
++    return fold_masks_s(ctx, op, s_mask);
+ }
+ static bool fold_not(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 30/72] tcg/optimize: Use fold_masks_s in fold_not
+Avoid the use of the OptContext slots.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 7 +------
+file changed, 1 insertion(+), 6 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
+     if (fold_const1(ctx, op)) {
+         return true;
+     }
+-
+-    ctx->s_mask = arg_info(op->args[1])->s_mask;
+-
+-    /* Because of fold_to_not, we want to always return true, via finish. */
+-    finish_folding(ctx, op);
+-    return true;
++    return fold_masks_s(ctx, op, arg_info(op->args[1])->s_mask);
+ }
+ static bool fold_or(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 31/72] tcg/optimize: Use fold_masks_zs in fold_or
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 13 ++++++++-----
+file changed, 8 insertions(+), 5 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
+ static bool fold_or(OptContext *ctx, TCGOp *op)
+ {
++    uint64_t z_mask, s_mask;
++    TempOptInfo *t1, *t2;
++
+     if (fold_const2_commutative(ctx, op) ||
+         fold_xi_to_x(ctx, op, 0) ||
+         fold_xx_to_x(ctx, op)) {
+         return true;
+     }
+-    ctx->z_mask = arg_info(op->args[1])->z_mask
+-                | arg_info(op->args[2])->z_mask;
+-    ctx->s_mask = arg_info(op->args[1])->s_mask
+-                & arg_info(op->args[2])->s_mask;
+-    return fold_masks(ctx, op);
++    t1 = arg_info(op->args[1]);
++    t2 = arg_info(op->args[2]);
++    z_mask = t1->z_mask | t2->z_mask;
++    s_mask = t1->s_mask & t2->s_mask;
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
+--
+.43.0

-New patch
+[PULL 32/72] tcg/optimize: Use fold_masks_zs in fold_orc
+Avoid the use of the OptContext slots.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 8 +++++---
+file changed, 5 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
+ {
++    uint64_t s_mask;
++
+     if (fold_const2(ctx, op) ||
+         fold_xx_to_i(ctx, op, -1) ||
+         fold_xi_to_x(ctx, op, -1) ||
+@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
+         return true;
+     }
+-    ctx->s_mask = arg_info(op->args[1])->s_mask
+-                & arg_info(op->args[2])->s_mask;
+-    return false;
++    s_mask = arg_info(op->args[1])->s_mask
++           & arg_info(op->args[2])->s_mask;
++    return fold_masks_s(ctx, op, s_mask);
+ }
+ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+--
+.43.0

-[PULL 47/52] plugins: Move plugin_insn_append to translator.c
+[PULL 33/72] tcg/optimize: Use fold_masks_zs in fold_qemu_ld
-This function is only used in translator.c, and uses a
+Avoid the use of the OptContext slots.
 target-specific typedef: abi_ptr.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Be careful not to call fold_masks_zs when the memory operation
 is wide enough to require multiple outputs, so split into two
 functions: fold_qemu_ld_1reg and fold_qemu_ld_2reg.
 Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/plugin-gen.h | 22 ----------------------
+ tcg/optimize.c | 26 +++++++++++++++++++++-----
- accel/tcg/translator.c    | 21 +++++++++++++++++++++
+file changed, 21 insertions(+), 5 deletions(-)
 files changed, 21 insertions(+), 22 deletions(-)
-diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/plugin-gen.h
+--- a/tcg/optimize.c
-+++ b/include/exec/plugin-gen.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void plugin_gen_insn_end(void);
+@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
- void plugin_gen_disable_mem_helpers(void);
+     return fold_masks_s(ctx, op, s_mask);
  void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info);
 -static inline void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
 -{
 -    struct qemu_plugin_insn *insn = tcg_ctx->plugin_insn;
 -    abi_ptr off;
 -
 -    if (insn == NULL) {
 -        return;
 -    }
 -    off = pc - insn->vaddr;
 -    if (off < insn->data->len) {
 -        g_byte_array_set_size(insn->data, off);
 -    } else if (off > insn->data->len) {
 -        /* we have an unexpected gap */
 -        g_assert_not_reached();
 -    }
 -
 -    insn->data = g_byte_array_append(insn->data, from, size);
 -}
 -
  #else /* !CONFIG_PLUGIN */
  static inline bool
@@ -XXX,XX +XXX,XX @@ static inline void plugin_gen_disable_mem_helpers(void)
  static inline void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info)
  { }
 -static inline void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
 -{ }
 -
  #endif /* CONFIG_PLUGIN */
  #endif /* QEMU_PLUGIN_GEN_H */
 diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translator.c
 +++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@ static void *translator_access(CPUArchState *env, DisasContextBase *db,
      return host + (pc - base);
  }
-+static void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
+-static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
-+{
++static bool fold_qemu_ld_1reg(OptContext *ctx, TCGOp *op)
-+#ifdef CONFIG_PLUGIN
+ {
-+    struct qemu_plugin_insn *insn = tcg_ctx->plugin_insn;
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
-+    abi_ptr off;
+     MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
      MemOp mop = get_memop(oi);
      int width = 8 * memop_size(mop);
 +    uint64_t z_mask = -1, s_mask = 0;
      if (width < 64) {
          if (mop & MO_SIGN) {
 -            ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
 +            s_mask = MAKE_64BIT_MASK(width - 1, 64 - (width - 1));
          } else {
 -            ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +            z_mask = MAKE_64BIT_MASK(0, width);
          }
      }
      /* Opcodes that touch guest memory stop the mb optimization.  */
      ctx->prev_mb = NULL;
 -    return false;
 +
-+    if (insn == NULL) {
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
 +        return;
 +    }
 +    off = pc - insn->vaddr;
 +    if (off < insn->data->len) {
 +        g_byte_array_set_size(insn->data, off);
 +    } else if (off > insn->data->len) {
 +        /* we have an unexpected gap */
 +        g_assert_not_reached();
 +    }
 +
 +    insn->data = g_byte_array_append(insn->data, from, size);
 +#endif
 +}
 +
- uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
++static bool fold_qemu_ld_2reg(OptContext *ctx, TCGOp *op)
- {
++{
-     uint8_t ret;
++    /* Opcodes that touch guest memory stop the mb optimization.  */
 +    ctx->prev_mb = NULL;
 +    return finish_folding(ctx, op);
  }
  static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          case INDEX_op_qemu_ld_a32_i32:
          case INDEX_op_qemu_ld_a64_i32:
 +            done = fold_qemu_ld_1reg(&ctx, op);
 +            break;
          case INDEX_op_qemu_ld_a32_i64:
          case INDEX_op_qemu_ld_a64_i64:
 +            if (TCG_TARGET_REG_BITS == 64) {
 +                done = fold_qemu_ld_1reg(&ctx, op);
 +                break;
 +            }
 +            QEMU_FALLTHROUGH;
          case INDEX_op_qemu_ld_a32_i128:
          case INDEX_op_qemu_ld_a64_i128:
 -            done = fold_qemu_ld(&ctx, op);
 +            done = fold_qemu_ld_2reg(&ctx, op);
              break;
          case INDEX_op_qemu_st8_a32_i32:
          case INDEX_op_qemu_st8_a64_i32:
 --
-.34.1
+.43.0

-[PULL 36/52] accel/tcg: Introduce translator_io_start
+[PULL 34/72] tcg/optimize: Return true from fold_qemu_st, fold_tcg_st
-New wrapper around gen_io_start which takes care of the USE_ICOUNT
+Stores have no output operands, and so need no further work.
 check, as well as marking the DisasContext to end the TB.
 Remove exec/gen-icount.h.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- MAINTAINERS                                   |  1 -
+ tcg/optimize.c | 11 +++++------
- include/exec/gen-icount.h                     |  6 --
+file changed, 5 insertions(+), 6 deletions(-)
  include/exec/translator.h                     | 10 +++
  target/arm/cpregs.h                           |  4 +-
  accel/tcg/translator.c                        | 27 ++++++-
  target/alpha/translate.c                      | 15 +---
  target/arm/tcg/translate-a64.c                | 23 +++---
  target/arm/tcg/translate-mve.c                |  1 -
  target/arm/tcg/translate-neon.c               |  1 -
  target/arm/tcg/translate-vfp.c                |  4 +-
  target/arm/tcg/translate.c                    | 20 ++---
  target/avr/translate.c                        |  1 -
  target/cris/translate.c                       |  2 -
  target/hppa/translate.c                       |  5 +-
  target/i386/tcg/translate.c                   | 52 +++----------
  target/loongarch/translate.c                  |  2 -
  target/m68k/translate.c                       |  2 -
  target/microblaze/translate.c                 |  2 -
  target/mips/tcg/translate.c                   | 29 +++----
  target/nios2/translate.c                      |  1 -
  target/openrisc/translate.c                   |  9 +--
  target/ppc/translate.c                        | 13 +---
  target/riscv/translate.c                      |  2 -
  target/rx/translate.c                         |  2 -
  target/s390x/tcg/translate.c                  |  6 +-
  target/sh4/translate.c                        |  2 -
  target/sparc/translate.c                      | 75 +++++--------------
  target/tricore/translate.c                    |  2 -
  target/xtensa/translate.c                     | 27 ++-----
  target/loongarch/insn_trans/trans_extra.c.inc |  4 +-
  .../insn_trans/trans_privileged.c.inc         |  4 +-
  .../riscv/insn_trans/trans_privileged.c.inc   |  8 +-
  target/riscv/insn_trans/trans_rvi.c.inc       | 24 ++----
 files changed, 117 insertions(+), 269 deletions(-)
  delete mode 100644 include/exec/gen-icount.h
-diff --git a/MAINTAINERS b/MAINTAINERS
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/MAINTAINERS
+--- a/tcg/optimize.c
-+++ b/MAINTAINERS
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ F: ui/cocoa.m
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
  Main loop
  M: Paolo Bonzini <pbonzini@redhat.com>
  S: Maintained
 -F: include/exec/gen-icount.h
  F: include/qemu/main-loop.h
  F: include/sysemu/runstate.h
  F: include/sysemu/runstate-action.h
 diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
 deleted file mode 100644
 index XXXXXXX..XXXXXXX
 --- a/include/exec/gen-icount.h
 +++ /dev/null
@@ -XXX,XX +XXX,XX @@
 -#ifndef GEN_ICOUNT_H
 -#define GEN_ICOUNT_H
 -
 -void gen_io_start(void);
 -
 -#endif
 diff --git a/include/exec/translator.h b/include/exec/translator.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/translator.h
 +++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
   */
  bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest);
 +/**
 + * translator_io_start
 + * @db: Disassembly context
 + *
 + * If icount is enabled, set cpu->can_to_io, adjust db->is_jmp to
 + * DISAS_TOO_MANY if it is still DISAS_NEXT, and return true.
 + * Otherwise return false.
 + */
 +bool translator_io_start(DisasContextBase *db);
 +
  /*
   * Translator Load Functions
   *
 diff --git a/target/arm/cpregs.h b/target/arm/cpregs.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpregs.h
 +++ b/target/arm/cpregs.h
@@ -XXX,XX +XXX,XX @@ enum {
      ARM_CP_ALIAS                 = 1 << 8,
      /*
       * Flag: Register does I/O and therefore its accesses need to be marked
 -     * with gen_io_start() and also end the TB. In particular, registers which
 -     * implement clocks or timers require this.
 +     * with translator_io_start() and also end the TB. In particular,
 +     * registers which implement clocks or timers require this.
       */
      ARM_CP_IO                    = 1 << 9,
      /*
 diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translator.c
 +++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@
  #include "tcg/tcg.h"
  #include "tcg/tcg-op.h"
  #include "exec/exec-all.h"
 -#include "exec/gen-icount.h"
  #include "exec/log.h"
  #include "exec/translator.h"
  #include "exec/plugin-gen.h"
  #include "exec/replay-core.h"
 -void gen_io_start(void)
 +static void gen_io_start(void)
  {
-     tcg_gen_st_i32(tcg_constant_i32(1), cpu_env,
+     /* Opcodes that touch guest memory stop the mb optimization.  */
-                    offsetof(ArchCPU, parent_obj.can_do_io) -
+     ctx->prev_mb = NULL;
-                    offsetof(ArchCPU, env));
+-    return false;
 +    return true;
  }
-+bool translator_io_start(DisasContextBase *db)
+ static bool fold_remainder(OptContext *ctx, TCGOp *op)
-+{
+@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st(OptContext *ctx, TCGOp *op)
-+    uint32_t cflags = tb_cflags(db->tb);
-+
+     if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
-+    if (!(cflags & CF_USE_ICOUNT)) {
+         remove_mem_copy_all(ctx);
-+        return false;
+-        return false;
 +    }
 +    if (db->num_insns == db->max_insns && (cflags & CF_LAST_IO)) {
 +        /* Already started in translator_loop. */
 +        return true;
-+    }
-+
-+    gen_io_start();
-+
-+    /*
-+     * Ensure that this instruction will be the last in the TB.
-+     * The target may override this to something more forceful.
-+     */
-+    if (db->is_jmp == DISAS_NEXT) {
-+        db->is_jmp = DISAS_TOO_MANY;
-+    }
-+    return true;
-+}
-+
- static TCGOp *gen_tb_start(uint32_t cflags)
- {
-     TCGv_i32 count = tcg_temp_new_i32();
-diff --git a/target/alpha/translate.c b/target/alpha/translate.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/alpha/translate.c
-+++ b/target/alpha/translate.c
-@@ -XXX,XX +XXX,XX @@ static TCGv cpu_lock_value;
- static TCGv cpu_pal_ir[31];
- #endif
--#include "exec/gen-icount.h"
--
- void alpha_translate_init(void)
- {
- #define DEF_VAR(V)  { &cpu_##V, #V, offsetof(CPUAlphaState, V) }
-@@ -XXX,XX +XXX,XX @@ static DisasJumpType gen_mfpr(DisasContext *ctx, TCGv va, int regno)
-     case 249: /* VMTIME */
-         helper = gen_helper_get_vmtime;
-     do_helper:
--        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
--            gen_io_start();
-+        if (translator_io_start(&ctx->base)) {
-             helper(va);
-             return DISAS_PC_STALE;
-         } else {
-@@ -XXX,XX +XXX,XX @@ static DisasJumpType gen_mtpr(DisasContext *ctx, TCGv vb, int regno)
-     case 251:
-         /* ALARM */
--        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
--            gen_io_start();
-+        if (translator_io_start(&ctx->base)) {
-             ret = DISAS_PC_STALE;
-         }
-         gen_helper_set_alarm(cpu_env, vb);
-@@ -XXX,XX +XXX,XX @@ static DisasJumpType translate_one(DisasContext *ctx, uint32_t insn)
-         case 0xC000:
-             /* RPCC */
-             va = dest_gpr(ctx, ra);
--            if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
--                gen_io_start();
--                gen_helper_load_pcc(va, cpu_env);
-+            if (translator_io_start(&ctx->base)) {
-                 ret = DISAS_PC_STALE;
--            } else {
--                gen_helper_load_pcc(va, cpu_env);
-             }
-+            gen_helper_load_pcc(va, cpu_env);
-             break;
-         case 0xE000:
-             /* RC */
-diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
-index XXXXXXX..XXXXXXX 100644
---- a/target/arm/tcg/translate-a64.c
-+++ b/target/arm/tcg/translate-a64.c
-@@ -XXX,XX +XXX,XX @@
- #include "internals.h"
- #include "qemu/host-utils.h"
- #include "semihosting/semihost.h"
--#include "exec/gen-icount.h"
- #include "exec/log.h"
- #include "cpregs.h"
- #include "translate-a64.h"
-@@ -XXX,XX +XXX,XX @@ static bool trans_ERET(DisasContext *s, arg_ERET *a)
-     tcg_gen_ld_i64(dst, cpu_env,
-                    offsetof(CPUARMState, elr_el[s->current_el]));
--    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
--        gen_io_start();
--    }
-+    translator_io_start(&s->base);
-     gen_helper_exception_return(cpu_env, dst);
-     /* Must exit loop to check un-masked IRQs */
-@@ -XXX,XX +XXX,XX @@ static bool trans_ERETA(DisasContext *s, arg_reta *a)
-                    offsetof(CPUARMState, elr_el[s->current_el]));
-     dst = auth_branch_target(s, dst, cpu_X[31], !a->m);
--    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
--        gen_io_start();
--    }
-+
-+    translator_io_start(&s->base);
-     gen_helper_exception_return(cpu_env, dst);
-     /* Must exit loop to check un-masked IRQs */
-@@ -XXX,XX +XXX,XX @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
-     uint32_t key = ENCODE_AA64_CP_REG(CP_REG_ARM64_SYSREG_CP,
-                                       crn, crm, op0, op1, op2);
-     const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key);
-+    bool need_exit_tb = false;
-     TCGv_ptr tcg_ri = NULL;
-     TCGv_i64 tcg_rt;
-@@ -XXX,XX +XXX,XX @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
-         return;
      }
--    if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
+     switch (op->opc) {
--        gen_io_start();
+@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st(OptContext *ctx, TCGOp *op)
 +    if (ri->type & ARM_CP_IO) {
 +        /* I/O operations must end the TB here (whether read or write) */
 +        need_exit_tb = translator_io_start(&s->base);
      }
      tcg_rt = cpu_reg(s, rt);
@@ -XXX,XX +XXX,XX @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
          }
      }
 -    if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
 -        /* I/O operations must end the TB here (whether read or write) */
 -        s->base.is_jmp = DISAS_UPDATE_EXIT;
 -    }
      if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
          /*
           * A write to any coprocessor regiser that ends a TB
@@ -XXX,XX +XXX,XX @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
           * but allow this to be suppressed by the register definition
           * (usually only necessary to work around guest bugs).
           */
 +        need_exit_tb = true;
 +    }
 +    if (need_exit_tb) {
          s->base.is_jmp = DISAS_UPDATE_EXIT;
      }
  }
 diff --git a/target/arm/tcg/translate-mve.c b/target/arm/tcg/translate-mve.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-mve.c
 +++ b/target/arm/tcg/translate-mve.c
@@ -XXX,XX +XXX,XX @@
  #include "tcg/tcg-op.h"
  #include "tcg/tcg-op-gvec.h"
  #include "exec/exec-all.h"
 -#include "exec/gen-icount.h"
  #include "translate.h"
  #include "translate-a32.h"
 diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-neon.c
 +++ b/target/arm/tcg/translate-neon.c
@@ -XXX,XX +XXX,XX @@
  #include "tcg/tcg-op.h"
  #include "tcg/tcg-op-gvec.h"
  #include "exec/exec-all.h"
 -#include "exec/gen-icount.h"
  #include "translate.h"
  #include "translate-a32.h"
 diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-vfp.c
 +++ b/target/arm/tcg/translate-vfp.c
@@ -XXX,XX +XXX,XX @@
  #include "tcg/tcg-op.h"
  #include "tcg/tcg-op-gvec.h"
  #include "exec/exec-all.h"
 -#include "exec/gen-icount.h"
  #include "translate.h"
  #include "translate-a32.h"
@@ -XXX,XX +XXX,XX @@ static void gen_preserve_fp_state(DisasContext *s, bool skip_context_update)
           * so we must mark it as an IO operation for icount (and cause
           * this to be the last insn in the TB).
           */
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 +        if (translator_io_start(&s->base)) {
              s->base.is_jmp = DISAS_UPDATE_EXIT;
 -            gen_io_start();
          }
          gen_helper_v7m_preserve_fp_state(cpu_env);
          /*
 diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate.c
 +++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "cpregs.h"
  #include "translate.h"
  #include "translate-a32.h"
 -#include "exec/gen-icount.h"
  #include "exec/helper-proto.h"
  #define HELPER_H "helper.h"
@@ -XXX,XX +XXX,XX @@ static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr)
       * appropriately depending on the new Thumb bit, so it must
       * be called after storing the new PC.
       */
 -    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&s->base);
      gen_helper_cpsr_write_eret(cpu_env, cpsr);
      /* Must exit loop to check un-masked IRQs */
      s->base.is_jmp = DISAS_EXIT;
@@ -XXX,XX +XXX,XX @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64,
      uint32_t key = ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2);
      const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key);
      TCGv_ptr tcg_ri = NULL;
 -    bool need_exit_tb;
 +    bool need_exit_tb = false;
      uint32_t syndrome;
      /*
@@ -XXX,XX +XXX,XX @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64,
          g_assert_not_reached();
      }
+     remove_mem_copy_in(ctx, ofs, ofs + lm1);
--    if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
+-    return false;
--        gen_io_start();
++    return true;
-+    if (ri->type & ARM_CP_IO) {
+ }
-+        /* I/O operations must end the TB here (whether read or write) */
-+        need_exit_tb = translator_io_start(&s->base);
+ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
      TCGType type;
      if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
 -        fold_tcg_st(ctx, op);
 -        return false;
 +        return fold_tcg_st(ctx, op);
      }
-     if (isread) {
+     src = arg_temp(op->args[0]);
-@@ -XXX,XX +XXX,XX @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64,
+@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
-         }
+     last = ofs + tcg_type_size(type) - 1;
-     }
+     remove_mem_copy_in(ctx, ofs, last);
+     record_mem_copy(ctx, type, src, ofs, last);
--    /* I/O operations must end the TB here (whether read or write) */
+-    return false;
--    need_exit_tb = ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) &&
++    return true;
 -                    (ri->type & ARM_CP_IO));
 -
      if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
          /*
           * A write to any coprocessor register that ends a TB
@@ -XXX,XX +XXX,XX @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n)
      if (exc_return) {
          /* Restore CPSR from SPSR.  */
          tmp = load_cpu_field(spsr);
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -        }
 +        translator_io_start(&s->base);
          gen_helper_cpsr_write_eret(cpu_env, tmp);
          /* Must exit loop to check un-masked IRQs */
          s->base.is_jmp = DISAS_EXIT;
 diff --git a/target/avr/translate.c b/target/avr/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/translate.c
 +++ b/target/avr/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/helper-gen.h"
  #include "exec/log.h"
  #include "exec/translator.h"
 -#include "exec/gen-icount.h"
  #define HELPER_H "helper.h"
  #include "exec/helper-info.c.inc"
 diff --git a/target/cris/translate.c b/target/cris/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/cris/translate.c
 +++ b/target/cris/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv env_btaken;
  static TCGv env_btarget;
  static TCGv env_pc;
 -#include "exec/gen-icount.h"
 -
  /* This is the state at translation time.  */
  typedef struct DisasContext {
      DisasContextBase base;
 diff --git a/target/hppa/translate.c b/target/hppa/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/translate.c
 +++ b/target/hppa/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_reg cpu_psw_v;
  static TCGv_reg cpu_psw_cb;
  static TCGv_reg cpu_psw_cb_msb;
 -#include "exec/gen-icount.h"
 -
  void hppa_translate_init(void)
  {
  #define DEF_VAR(V)  { &cpu_##V, #V, offsetof(CPUHPPAState, V) }
@@ -XXX,XX +XXX,XX @@ static bool trans_mfctl(DisasContext *ctx, arg_mfctl *a)
          /* FIXME: Respect PSW_S bit.  */
          nullify_over(ctx);
          tmp = dest_gpr(ctx, rt);
 -        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 +        if (translator_io_start(&ctx->base)) {
              gen_helper_read_interval_timer(tmp);
              ctx->base.is_jmp = DISAS_IAQ_N_STALE;
          } else {
 diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/translate.c
 +++ b/target/i386/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_seg_base[6];
  static TCGv_i64 cpu_bndl[4];
  static TCGv_i64 cpu_bndu[4];
 -#include "exec/gen-icount.h"
 -
  typedef struct DisasContext {
      DisasContextBase base;
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
                  !(s->cpuid_ext_features & CPUID_EXT_RDRAND)) {
                  goto illegal_op;
              }
 -            if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -                gen_io_start();
 -                s->base.is_jmp = DISAS_TOO_MANY;
 -            }
 +            translator_io_start(&s->base);
              gen_helper_rdrand(s->T0, cpu_env);
              rm = (modrm & 7) | REX_B(s);
              gen_op_mov_reg_v(s, dflag, rm, s->T0);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
                            SVM_IOIO_TYPE_MASK | SVM_IOIO_STR_MASK)) {
              break;
          }
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -            s->base.is_jmp = DISAS_TOO_MANY;
 -        }
 +        translator_io_start(&s->base);
          if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) {
              gen_repz_ins(s, ot);
          } else {
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
          if (!gen_check_io(s, ot, s->tmp2_i32, SVM_IOIO_STR_MASK)) {
              break;
          }
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -            s->base.is_jmp = DISAS_TOO_MANY;
 -        }
 +        translator_io_start(&s->base);
          if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) {
              gen_repz_outs(s, ot);
          } else {
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
          if (!gen_check_io(s, ot, s->tmp2_i32, SVM_IOIO_TYPE_MASK)) {
              break;
          }
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -            s->base.is_jmp = DISAS_TOO_MANY;
 -        }
 +        translator_io_start(&s->base);
          gen_helper_in_func(ot, s->T1, s->tmp2_i32);
          gen_op_mov_reg_v(s, ot, R_EAX, s->T1);
          gen_bpt_io(s, s->tmp2_i32, ot);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
          if (!gen_check_io(s, ot, s->tmp2_i32, 0)) {
              break;
          }
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -            s->base.is_jmp = DISAS_TOO_MANY;
 -        }
 +        translator_io_start(&s->base);
          gen_op_mov_v_reg(s, ot, s->T1, R_EAX);
          tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
          gen_helper_out_func(ot, s->tmp2_i32, s->tmp3_i32);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
          if (!gen_check_io(s, ot, s->tmp2_i32, SVM_IOIO_TYPE_MASK)) {
              break;
          }
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -            s->base.is_jmp = DISAS_TOO_MANY;
 -        }
 +        translator_io_start(&s->base);
          gen_helper_in_func(ot, s->T1, s->tmp2_i32);
          gen_op_mov_reg_v(s, ot, R_EAX, s->T1);
          gen_bpt_io(s, s->tmp2_i32, ot);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
          if (!gen_check_io(s, ot, s->tmp2_i32, 0)) {
              break;
          }
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -            s->base.is_jmp = DISAS_TOO_MANY;
 -        }
 +        translator_io_start(&s->base);
          gen_op_mov_v_reg(s, ot, s->T1, R_EAX);
          tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
          gen_helper_out_func(ot, s->tmp2_i32, s->tmp3_i32);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
      case 0x131: /* rdtsc */
          gen_update_cc_op(s);
          gen_update_eip_cur(s);
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -            s->base.is_jmp = DISAS_TOO_MANY;
 -        }
 +        translator_io_start(&s->base);
          gen_helper_rdtsc(cpu_env);
          break;
      case 0x133: /* rdpmc */
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
              }
              gen_update_cc_op(s);
              gen_update_eip_cur(s);
 -            if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -                gen_io_start();
 -                s->base.is_jmp = DISAS_TOO_MANY;
 -            }
 +            translator_io_start(&s->base);
              gen_helper_rdtscp(cpu_env);
              break;
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
          }
          ot  = (CODE64(s) ? MO_64 : MO_32);
 -        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -            s->base.is_jmp = DISAS_TOO_MANY;
 -        }
 +        translator_io_start(&s->base);
          if (b & 2) {
              gen_svm_check_intercept(s, SVM_EXIT_WRITE_CR0 + reg);
              gen_op_mov_v_reg(s, ot, s->T0, rm);
 diff --git a/target/loongarch/translate.c b/target/loongarch/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/loongarch/translate.c
 +++ b/target/loongarch/translate.c
@@ -XXX,XX +XXX,XX @@
  TCGv cpu_gpr[32], cpu_pc;
  static TCGv cpu_lladdr, cpu_llval;
 -#include "exec/gen-icount.h"
 -
  #define HELPER_H "helper.h"
  #include "exec/helper-info.c.inc"
  #undef  HELPER_H
 diff --git a/target/m68k/translate.c b/target/m68k/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/translate.c
 +++ b/target/m68k/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv NULL_QREG;
  /* Used to distinguish stores from bad addressing modes.  */
  static TCGv store_dummy;
 -#include "exec/gen-icount.h"
 -
  void m68k_tcg_init(void)
  {
      char *p;
 diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/translate.c
 +++ b/target/microblaze/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_i32 cpu_iflags;
  static TCGv cpu_res_addr;
  static TCGv_i32 cpu_res_val;
 -#include "exec/gen-icount.h"
 -
  /* This is the state at translation time.  */
  typedef struct DisasContext {
      DisasContextBase base;
 diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/tcg/translate.c
 +++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_i32 hflags;
  TCGv_i32 fpu_fcr0, fpu_fcr31;
  TCGv_i64 fpu_f64[32];
 -#include "exec/gen-icount.h"
 -
  static const char regnames_HI[][4] = {
      "HI0", "HI1", "HI2", "HI3",
  };
@@ -XXX,XX +XXX,XX @@ static void gen_mfc0(DisasContext *ctx, TCGv arg, int reg, int sel)
          switch (sel) {
          case CP0_REG09__COUNT:
              /* Mark as an IO operation because we read the time.  */
 -            if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -                gen_io_start();
 -            }
 +            translator_io_start(&ctx->base);
 +
              gen_helper_mfc0_count(arg, cpu_env);
              /*
               * Break the TB to be able to take timer interrupts immediately
@@ -XXX,XX +XXX,XX @@ cp0_unimplemented:
  static void gen_mtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
  {
      const char *register_name = "invalid";
 +    bool icount;
      if (sel != 0) {
          check_insn(ctx, ISA_MIPS_R1);
      }
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    icount = translator_io_start(&ctx->base);
      switch (reg) {
      case CP0_REGISTER_00:
@@ -XXX,XX +XXX,XX @@ static void gen_mtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
      trace_mips_translate_c0("mtc0", register_name, reg, sel);
      /* For simplicity assume that all writes can cause interrupts.  */
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 +    if (icount) {
          /*
           * DISAS_STOP isn't sufficient, we need to ensure we break out of
           * translated code to check for pending interrupts.
@@ -XXX,XX +XXX,XX @@ static void gen_dmfc0(DisasContext *ctx, TCGv arg, int reg, int sel)
          switch (sel) {
          case CP0_REG09__COUNT:
              /* Mark as an IO operation because we read the time.  */
 -            if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -                gen_io_start();
 -            }
 +            translator_io_start(&ctx->base);
              gen_helper_mfc0_count(arg, cpu_env);
              /*
               * Break the TB to be able to take timer interrupts immediately
@@ -XXX,XX +XXX,XX @@ cp0_unimplemented:
  static void gen_dmtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
  {
      const char *register_name = "invalid";
 +    bool icount;
      if (sel != 0) {
          check_insn(ctx, ISA_MIPS_R1);
      }
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    icount = translator_io_start(&ctx->base);
      switch (reg) {
      case CP0_REGISTER_00:
@@ -XXX,XX +XXX,XX @@ static void gen_dmtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
      trace_mips_translate_c0("dmtc0", register_name, reg, sel);
      /* For simplicity assume that all writes can cause interrupts.  */
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 +    if (icount) {
          /*
           * DISAS_STOP isn't sufficient, we need to ensure we break out of
           * translated code to check for pending interrupts.
@@ -XXX,XX +XXX,XX @@ void gen_rdhwr(DisasContext *ctx, int rt, int rd, int sel)
          gen_store_gpr(t0, rt);
          break;
      case 2:
 -        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -        }
 +        translator_io_start(&ctx->base);
          gen_helper_rdhwr_cc(t0, cpu_env);
          gen_store_gpr(t0, rt);
          /*
 diff --git a/target/nios2/translate.c b/target/nios2/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/nios2/translate.c
 +++ b/target/nios2/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/cpu_ldst.h"
  #include "exec/translator.h"
  #include "qemu/qemu-print.h"
 -#include "exec/gen-icount.h"
  #include "semihosting/semihost.h"
  #define HELPER_H "helper.h"
 diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/openrisc/translate.c
 +++ b/target/openrisc/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/helper-proto.h"
  #include "exec/helper-gen.h"
 -#include "exec/gen-icount.h"
  #include "exec/log.h"
@@ -XXX,XX +XXX,XX @@ static bool trans_l_mfspr(DisasContext *dc, arg_l_mfspr *a)
      check_r0_write(dc, a->d);
 -    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 +    if (translator_io_start(&dc->base)) {
          if (dc->delayed_branch) {
              tcg_gen_mov_tl(cpu_pc, jmp_pc);
              tcg_gen_discard_tl(jmp_pc);
@@ -XXX,XX +XXX,XX @@ static bool trans_l_mtspr(DisasContext *dc, arg_l_mtspr *a)
  {
      TCGv spr = tcg_temp_new();
 -    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&dc->base);
 +
      /*
       * For SR, we will need to exit the TB to recognize the new
       * exception state.  For NPC, in theory this counts as a branch
 diff --git a/target/ppc/translate.c b/target/ppc/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/translate.c
 +++ b/target/ppc/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_reserve_val2;
  static TCGv cpu_fpscr;
  static TCGv_i32 cpu_access_type;
 -#include "exec/gen-icount.h"
 -
  void ppc_translate_init(void)
  {
      int i;
@@ -XXX,XX +XXX,XX @@ static void gen_exception_nip(DisasContext *ctx, uint32_t excp,
  static void gen_icount_io_start(DisasContext *ctx)
  {
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -        /*
 -         * An I/O instruction must be last in the TB.
 -         * Chain to the next TB, and let the code from gen_tb_start
 -         * decide if we need to return to the main loop.
 -         * Doing this first also allows this value to be overridden.
 -         */
 -        ctx->base.is_jmp = DISAS_TOO_MANY;
 -    }
 +    translator_io_start(&ctx->base);
  }
- #if !defined(CONFIG_USER_ONLY)
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
 diff --git a/target/riscv/translate.c b/target/riscv/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/translate.c
 +++ b/target/riscv/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv load_val;
  static TCGv pm_mask;
  static TCGv pm_base;
 -#include "exec/gen-icount.h"
 -
  /*
   * If an operation is being performed on less than TARGET_LONG_BITS,
   * it may require the inputs to be sign- or zero-extended; which will
 diff --git a/target/rx/translate.c b/target/rx/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/translate.c
 +++ b/target/rx/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_i64 cpu_acc;
  #define cpu_sp cpu_regs[0]
 -#include "exec/gen-icount.h"
 -
  /* decoder helper */
  static uint32_t decode_load_bytes(DisasContext *ctx, uint32_t insn,
                             int i, int n)
 diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/tcg/translate.c
 +++ b/target/s390x/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/log.h"
  #include "qemu/host-utils.h"
  #include "exec/cpu_ldst.h"
 -#include "exec/gen-icount.h"
  #include "exec/helper-proto.h"
  #include "exec/helper-gen.h"
@@ -XXX,XX +XXX,XX @@ static DisasJumpType translate_one(CPUS390XState *env, DisasContext *s)
          /* input/output is the special case for icount mode */
          if (unlikely(insn->flags & IF_IO)) {
 -            icount = tb_cflags(s->base.tb) & CF_USE_ICOUNT;
 -            if (icount) {
 -                gen_io_start();
 -            }
 +            icount = translator_io_start(&s->base);
          }
      }
 diff --git a/target/sh4/translate.c b/target/sh4/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/translate.c
 +++ b/target/sh4/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_fregs[32];
  /* internal register indexes */
  static TCGv cpu_flags, cpu_delayed_pc, cpu_delayed_cond;
 -#include "exec/gen-icount.h"
 -
  void sh4_translate_init(void)
  {
      int i;
 diff --git a/target/sparc/translate.c b/target/sparc/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/translate.c
 +++ b/target/sparc/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_wim;
  /* Floating point registers */
  static TCGv_i64 cpu_fpr[TARGET_DPREGS];
 -#include "exec/gen-icount.h"
 -
  typedef struct DisasContext {
      DisasContextBase base;
      target_ulong pc;    /* current Program Counter: integer or DYNAMIC_PC */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                          r_const = tcg_constant_i32(dc->mem_idx);
                          tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                         offsetof(CPUSPARCState, tick));
 -                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                            gen_io_start();
 +                        if (translator_io_start(&dc->base)) {
 +                            dc->base.is_jmp = DISAS_EXIT;
                          }
                          gen_helper_tick_get_count(cpu_dst, cpu_env, r_tickptr,
                                                    r_const);
                          gen_store_gpr(dc, rd, cpu_dst);
 -                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                            /* I/O operations in icount mode must end the TB */
 -                            dc->base.is_jmp = DISAS_EXIT;
 -                        }
                      }
                      break;
                  case 0x5: /* V9 rdpc */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                          r_const = tcg_constant_i32(dc->mem_idx);
                          tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                         offsetof(CPUSPARCState, stick));
 -                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                            gen_io_start();
 +                        if (translator_io_start(&dc->base)) {
 +                            dc->base.is_jmp = DISAS_EXIT;
                          }
                          gen_helper_tick_get_count(cpu_dst, cpu_env, r_tickptr,
                                                    r_const);
                          gen_store_gpr(dc, rd, cpu_dst);
 -                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                            /* I/O operations in icount mode must end the TB */
 -                            dc->base.is_jmp = DISAS_EXIT;
 -                        }
                      }
                      break;
                  case 0x19: /* System tick compare */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                          r_const = tcg_constant_i32(dc->mem_idx);
                          tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                         offsetof(CPUSPARCState, tick));
 -                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                            gen_io_start();
 +                        if (translator_io_start(&dc->base)) {
 +                            dc->base.is_jmp = DISAS_EXIT;
                          }
                          gen_helper_tick_get_count(cpu_tmp0, cpu_env,
                                                    r_tickptr, r_const);
 -                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                            /* I/O operations in icount mode must end the TB */
 -                            dc->base.is_jmp = DISAS_EXIT;
 -                        }
                      }
                      break;
                  case 5: // tba
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                      r_tickptr = tcg_temp_new_ptr();
                                      tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                     offsetof(CPUSPARCState, tick));
 -                                    if (tb_cflags(dc->base.tb) &
 -                                           CF_USE_ICOUNT) {
 -                                        gen_io_start();
 -                                    }
 +                                    translator_io_start(&dc->base);
                                      gen_helper_tick_set_limit(r_tickptr,
                                                                cpu_tick_cmpr);
                                      /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                      r_tickptr = tcg_temp_new_ptr();
                                      tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                     offsetof(CPUSPARCState, stick));
 -                                    if (tb_cflags(dc->base.tb) &
 -                                           CF_USE_ICOUNT) {
 -                                        gen_io_start();
 -                                    }
 +                                    translator_io_start(&dc->base);
                                      gen_helper_tick_set_count(r_tickptr,
                                                                cpu_tmp0);
                                      /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                      r_tickptr = tcg_temp_new_ptr();
                                      tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                     offsetof(CPUSPARCState, stick));
 -                                    if (tb_cflags(dc->base.tb) &
 -                                           CF_USE_ICOUNT) {
 -                                        gen_io_start();
 -                                    }
 +                                    translator_io_start(&dc->base);
                                      gen_helper_tick_set_limit(r_tickptr,
                                                                cpu_stick_cmpr);
                                      /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                      r_tickptr = tcg_temp_new_ptr();
                                      tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                     offsetof(CPUSPARCState, tick));
 -                                    if (tb_cflags(dc->base.tb) &
 -                                           CF_USE_ICOUNT) {
 -                                        gen_io_start();
 -                                    }
 +                                    translator_io_start(&dc->base);
                                      gen_helper_tick_set_count(r_tickptr,
                                                                cpu_tmp0);
                                      /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                  break;
                              case 6: // pstate
                                  save_state(dc);
 -                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                                    gen_io_start();
 -                                }
 -                                gen_helper_wrpstate(cpu_env, cpu_tmp0);
 -                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                                    /* I/O ops in icount mode must end the TB */
 +                                if (translator_io_start(&dc->base)) {
                                      dc->base.is_jmp = DISAS_EXIT;
                                  }
 +                                gen_helper_wrpstate(cpu_env, cpu_tmp0);
                                  dc->npc = DYNAMIC_PC;
                                  break;
                              case 7: // tl
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                  dc->npc = DYNAMIC_PC;
                                  break;
                              case 8: // pil
 -                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                                    gen_io_start();
 -                                }
 -                                gen_helper_wrpil(cpu_env, cpu_tmp0);
 -                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                                    /* I/O ops in icount mode must end the TB */
 +                                if (translator_io_start(&dc->base)) {
                                      dc->base.is_jmp = DISAS_EXIT;
                                  }
 +                                gen_helper_wrpil(cpu_env, cpu_tmp0);
                                  break;
                              case 9: // cwp
                                  gen_helper_wrcwp(cpu_env, cpu_tmp0);
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                      r_tickptr = tcg_temp_new_ptr();
                                      tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                     offsetof(CPUSPARCState, hstick));
 -                                    if (tb_cflags(dc->base.tb) &
 -                                           CF_USE_ICOUNT) {
 -                                        gen_io_start();
 -                                    }
 +                                    translator_io_start(&dc->base);
                                      gen_helper_tick_set_limit(r_tickptr,
                                                                cpu_hstick_cmpr);
                                      /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                  goto priv_insn;
                              dc->npc = DYNAMIC_PC;
                              dc->pc = DYNAMIC_PC;
 -                            if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                                gen_io_start();
 -                            }
 +                            translator_io_start(&dc->base);
                              gen_helper_done(cpu_env);
                              goto jmp_insn;
                          case 1:
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                  goto priv_insn;
                              dc->npc = DYNAMIC_PC;
                              dc->pc = DYNAMIC_PC;
 -                            if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -                                gen_io_start();
 -                            }
 +                            translator_io_start(&dc->base);
                              gen_helper_retry(cpu_env);
                              goto jmp_insn;
                          default:
 diff --git a/target/tricore/translate.c b/target/tricore/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tricore/translate.c
 +++ b/target/tricore/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_PSW_SV;
  static TCGv cpu_PSW_AV;
  static TCGv cpu_PSW_SAV;
 -#include "exec/gen-icount.h"
 -
  static const char *regnames_a[] = {
        "a0"  , "a1"  , "a2"  , "a3" , "a4"  , "a5" ,
        "a6"  , "a7"  , "a8"  , "a9" , "sp" , "a11" ,
 diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/xtensa/translate.c
 +++ b/target/xtensa/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_i32 cpu_exclusive_val;
  static GHashTable *xtensa_regfile_table;
 -#include "exec/gen-icount.h"
 -
  static char *sr_name[256];
  static char *ur_name[256];
@@ -XXX,XX +XXX,XX @@ static int gen_postprocess(DisasContext *dc, int slot)
  #ifndef CONFIG_USER_ONLY
      if (op_flags & XTENSA_OP_CHECK_INTERRUPTS) {
 -        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -        }
 +        translator_io_start(&dc->base);
          gen_helper_check_interrupts(cpu_env);
      }
  #endif
@@ -XXX,XX +XXX,XX @@ static void translate_rsr_ccount(DisasContext *dc, const OpcodeArg arg[],
                                   const uint32_t par[])
  {
  #ifndef CONFIG_USER_ONLY
 -    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&dc->base);
      gen_helper_update_ccount(cpu_env);
      tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]);
  #endif
@@ -XXX,XX +XXX,XX @@ static void translate_waiti(DisasContext *dc, const OpcodeArg arg[],
  #ifndef CONFIG_USER_ONLY
      TCGv_i32 pc = tcg_constant_i32(dc->base.pc_next);
 -    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&dc->base);
      gen_helper_waiti(cpu_env, pc, tcg_constant_i32(arg[0].imm));
  #endif
  }
@@ -XXX,XX +XXX,XX @@ static void translate_wsr_ccompare(DisasContext *dc, const OpcodeArg arg[],
      uint32_t id = par[0] - CCOMPARE;
      assert(id < dc->config->nccompare);
 -    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&dc->base);
      tcg_gen_mov_i32(cpu_SR[par[0]], arg[0].in);
      gen_helper_update_ccompare(cpu_env, tcg_constant_i32(id));
  #endif
@@ -XXX,XX +XXX,XX @@ static void translate_wsr_ccount(DisasContext *dc, const OpcodeArg arg[],
                                   const uint32_t par[])
  {
  #ifndef CONFIG_USER_ONLY
 -    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&dc->base);
      gen_helper_wsr_ccount(cpu_env, arg[0].in);
  #endif
  }
@@ -XXX,XX +XXX,XX @@ static void translate_xsr_ccount(DisasContext *dc, const OpcodeArg arg[],
  #ifndef CONFIG_USER_ONLY
      TCGv_i32 tmp = tcg_temp_new_i32();
 -    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 -
 +    translator_io_start(&dc->base);
      gen_helper_update_ccount(cpu_env);
      tcg_gen_mov_i32(tmp, cpu_SR[par[0]]);
      gen_helper_wsr_ccount(cpu_env, arg[0].in);
 diff --git a/target/loongarch/insn_trans/trans_extra.c.inc b/target/loongarch/insn_trans/trans_extra.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/loongarch/insn_trans/trans_extra.c.inc
 +++ b/target/loongarch/insn_trans/trans_extra.c.inc
@@ -XXX,XX +XXX,XX @@ static bool gen_rdtime(DisasContext *ctx, arg_rr *a,
      TCGv dst1 = gpr_dst(ctx, a->rd, EXT_NONE);
      TCGv dst2 = gpr_dst(ctx, a->rj, EXT_NONE);
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&ctx->base);
      gen_helper_rdtime_d(dst1, cpu_env);
      if (word) {
          tcg_gen_sextract_tl(dst1, dst1, high ? 32 : 0, 32);
 diff --git a/target/loongarch/insn_trans/trans_privileged.c.inc b/target/loongarch/insn_trans/trans_privileged.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/loongarch/insn_trans/trans_privileged.c.inc
 +++ b/target/loongarch/insn_trans/trans_privileged.c.inc
@@ -XXX,XX +XXX,XX @@ static bool check_csr_flags(DisasContext *ctx, const CSRInfo *csr, bool write)
      if ((csr->flags & CSRFL_READONLY) && write) {
          return false;
      }
 -    if ((csr->flags & CSRFL_IO) &&
 -        (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT)) {
 -        gen_io_start();
 +    if ((csr->flags & CSRFL_IO) && translator_io_start(&ctx->base)) {
          ctx->base.is_jmp = DISAS_EXIT_UPDATE;
      } else if ((csr->flags & CSRFL_EXITTB) && write) {
          ctx->base.is_jmp = DISAS_EXIT_UPDATE;
 diff --git a/target/riscv/insn_trans/trans_privileged.c.inc b/target/riscv/insn_trans/trans_privileged.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/insn_trans/trans_privileged.c.inc
 +++ b/target/riscv/insn_trans/trans_privileged.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_sret(DisasContext *ctx, arg_sret *a)
  #ifndef CONFIG_USER_ONLY
      if (has_ext(ctx, RVS)) {
          decode_save_opc(ctx);
 -        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -            gen_io_start();
 -        }
 +        translator_io_start(&ctx->base);
          gen_helper_sret(cpu_pc, cpu_env);
          exit_tb(ctx); /* no chaining */
          ctx->base.is_jmp = DISAS_NORETURN;
@@ -XXX,XX +XXX,XX @@ static bool trans_mret(DisasContext *ctx, arg_mret *a)
  {
  #ifndef CONFIG_USER_ONLY
      decode_save_opc(ctx);
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&ctx->base);
      gen_helper_mret(cpu_pc, cpu_env);
      exit_tb(ctx); /* no chaining */
      ctx->base.is_jmp = DISAS_NORETURN;
 diff --git a/target/riscv/insn_trans/trans_rvi.c.inc b/target/riscv/insn_trans/trans_rvi.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/insn_trans/trans_rvi.c.inc
 +++ b/target/riscv/insn_trans/trans_rvi.c.inc
@@ -XXX,XX +XXX,XX @@ static bool do_csrr(DisasContext *ctx, int rd, int rc)
      TCGv dest = dest_gpr(ctx, rd);
      TCGv_i32 csr = tcg_constant_i32(rc);
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&ctx->base);
      gen_helper_csrr(dest, cpu_env, csr);
      gen_set_gpr(ctx, rd, dest);
      return do_csr_post(ctx);
@@ -XXX,XX +XXX,XX @@ static bool do_csrw(DisasContext *ctx, int rc, TCGv src)
  {
      TCGv_i32 csr = tcg_constant_i32(rc);
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&ctx->base);
      gen_helper_csrw(cpu_env, csr, src);
      return do_csr_post(ctx);
  }
@@ -XXX,XX +XXX,XX @@ static bool do_csrrw(DisasContext *ctx, int rd, int rc, TCGv src, TCGv mask)
      TCGv dest = dest_gpr(ctx, rd);
      TCGv_i32 csr = tcg_constant_i32(rc);
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&ctx->base);
      gen_helper_csrrw(dest, cpu_env, csr, src, mask);
      gen_set_gpr(ctx, rd, dest);
      return do_csr_post(ctx);
@@ -XXX,XX +XXX,XX @@ static bool do_csrr_i128(DisasContext *ctx, int rd, int rc)
      TCGv desth = dest_gprh(ctx, rd);
      TCGv_i32 csr = tcg_constant_i32(rc);
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&ctx->base);
      gen_helper_csrr_i128(destl, cpu_env, csr);
      tcg_gen_ld_tl(desth, cpu_env, offsetof(CPURISCVState, retxh));
      gen_set_gpr128(ctx, rd, destl, desth);
@@ -XXX,XX +XXX,XX @@ static bool do_csrw_i128(DisasContext *ctx, int rc, TCGv srcl, TCGv srch)
  {
      TCGv_i32 csr = tcg_constant_i32(rc);
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&ctx->base);
      gen_helper_csrw_i128(cpu_env, csr, srcl, srch);
      return do_csr_post(ctx);
  }
@@ -XXX,XX +XXX,XX @@ static bool do_csrrw_i128(DisasContext *ctx, int rd, int rc,
      TCGv desth = dest_gprh(ctx, rd);
      TCGv_i32 csr = tcg_constant_i32(rc);
 -    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
 -        gen_io_start();
 -    }
 +    translator_io_start(&ctx->base);
      gen_helper_csrrw_i128(destl, cpu_env, csr, srcl, srch, maskl, maskh);
      tcg_gen_ld_tl(desth, cpu_env, offsetof(CPURISCVState, retxh));
      gen_set_gpr128(ctx, rd, destl, desth);
 --
-.34.1
+.43.0

-[PULL 50/52] tcg: Build once for system and once for user-only
+[PULL 35/72] tcg/optimize: Use finish_folding in fold_remainder
-Create two static libraries for use by each execution mode.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/meson.build | 30 +++++++++++++++++++++++++++---
+ tcg/optimize.c | 2 +-
-file changed, 27 insertions(+), 3 deletions(-)
+file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/tcg/meson.build b/tcg/meson.build
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/meson.build
+--- a/tcg/optimize.c
-+++ b/tcg/meson.build
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
-+if not get_option('tcg').allowed()
+         fold_xx_to_i(ctx, op, 0)) {
-+   subdir_done()
+         return true;
-+endif
+     }
-+
+-    return false;
- tcg_ss = ss.source_set()
++    return finish_folding(ctx, op);
+ }
- tcg_ss.add(files(
-@@ -XXX,XX +XXX,XX @@ tcg_ss.add(files(
+ static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
  if get_option('tcg_interpreter')
    libffi = dependency('libffi', version: '>=3.0', required: true,
                        method: 'pkg-config')
 -  specific_ss.add(libffi)
 -  specific_ss.add(files('tci.c'))
 +  tcg_ss.add(libffi)
 +  tcg_ss.add(files('tci.c'))
  endif
 -specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
 +tcg_ss = tcg_ss.apply(config_host, strict: false)
 +
 +libtcg_user = static_library('tcg_user',
 +                             tcg_ss.sources() + genh,
 +                             name_suffix: 'fa',
 +                             c_args: '-DCONFIG_USER_ONLY',
 +                             build_by_default: have_user)
 +
 +tcg_user = declare_dependency(link_with: libtcg_user,
 +                              dependencies: tcg_ss.dependencies())
 +user_ss.add(tcg_user)
 +
 +libtcg_softmmu = static_library('tcg_softmmu',
 +                                tcg_ss.sources() + genh,
 +                                name_suffix: 'fa',
 +                                c_args: '-DCONFIG_SOFTMMU',
 +                                build_by_default: have_system)
 +
 +tcg_softmmu = declare_dependency(link_with: libtcg_softmmu,
 +                                 dependencies: tcg_ss.dependencies())
 +softmmu_ss.add(tcg_softmmu)
 --
-.34.1
+.43.0

-[PULL 06/52] tcg: Widen CPUTLBEntry comparators to 64-bits
+[PULL 36/72] tcg/optimize: Distinguish simplification in fold_setcond_zmask
-This makes CPUTLBEntry agnostic to the address size of the guest.
+Change return from bool to int; distinguish between
-When 32-bit addresses are in effect, we can simply read the low
+complete folding, simplification, and no change.
 bits of the 64-bit field.  Similarly when we need to update
 the field for setting TLB_NOTDIRTY.
-For TCG backends that could in theory be big-endian, but in
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 practice are not (arm, loongarch, riscv), use QEMU_BUILD_BUG_ON
 to document and ensure this is not accidentally missed.
 For s390x, which is always big-endian, use HOST_BIG_ENDIAN anyway,
 to document the reason for the adjustment.
 For sparc64 and ppc64, always perform a 64-bit load, and rely on
 the following 32-bit comparison to ignore the high bits.
 Rearrange mips and ppc if ladders for clarity.
 Reviewed-by: Anton Johansson <anjo@rev.ng>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-defs.h          | 37 +++++++++++---------------------
+ tcg/optimize.c | 22 ++++++++++++++--------
- include/exec/cpu_ldst.h          | 19 ++++++++++------
+file changed, 14 insertions(+), 8 deletions(-)
  accel/tcg/cputlb.c               |  8 +++++--
  tcg/aarch64/tcg-target.c.inc     |  1 +
  tcg/arm/tcg-target.c.inc         |  1 +
  tcg/loongarch64/tcg-target.c.inc |  1 +
  tcg/mips/tcg-target.c.inc        | 13 ++++++-----
  tcg/ppc/tcg-target.c.inc         | 28 +++++++++++++-----------
  tcg/riscv/tcg-target.c.inc       |  1 +
  tcg/s390x/tcg-target.c.inc       |  1 +
  tcg/sparc64/tcg-target.c.inc     |  8 +++++--
 files changed, 67 insertions(+), 51 deletions(-)
-diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu-defs.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu-defs.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
- /* use a fully associative victim tlb of 8 entries */
+     return finish_folding(ctx, op);
- #define CPU_VTLB_SIZE 8
+ }
--#if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32
+-static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
--#define CPU_TLB_ENTRY_BITS 4
++/* Return 1 if finished, -1 if simplified, 0 if unchanged. */
--#else
++static int fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
  #define CPU_TLB_ENTRY_BITS 5
 -#endif
  #define CPU_TLB_DYN_MIN_BITS 6
  #define CPU_TLB_DYN_DEFAULT_BITS 8
@@ -XXX,XX +XXX,XX @@
  # endif
  /* Minimalized TLB entry for use by TCG fast path. */
 -typedef struct CPUTLBEntry {
 -    /* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
 -       bit TARGET_PAGE_BITS-1..4  : Nonzero for accesses that should not
 -                                    go directly to ram.
 -       bit 3                      : indicates that the entry is invalid
 -       bit 2..0                   : zero
 -    */
 -    union {
 -        struct {
 -            target_ulong addr_read;
 -            target_ulong addr_write;
 -            target_ulong addr_code;
 -            /* Addend to virtual address to get host address.  IO accesses
 -               use the corresponding iotlb value.  */
 -            uintptr_t addend;
 -        };
 +typedef union CPUTLBEntry {
 +    struct {
 +        uint64_t addr_read;
 +        uint64_t addr_write;
 +        uint64_t addr_code;
          /*
 -         * Padding to get a power of two size, as well as index
 -         * access to addr_{read,write,code}.
 +         * Addend to virtual address to get host address.  IO accesses
 +         * use the corresponding iotlb value.
           */
 -        target_ulong addr_idx[(1 << CPU_TLB_ENTRY_BITS) / TARGET_LONG_SIZE];
 +        uintptr_t addend;
      };
 +    /*
 +     * Padding to get a power of two size, as well as index
 +     * access to addr_{read,write,code}.
 +     */
 +    uint64_t addr_idx[(1 << CPU_TLB_ENTRY_BITS) / sizeof(uint64_t)];
  } CPUTLBEntry;
  QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
 -
  #endif  /* !CONFIG_USER_ONLY && CONFIG_TCG */
  #if !defined(CONFIG_USER_ONLY)
 diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/cpu_ldst.h
 +++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline target_ulong tlb_read_idx(const CPUTLBEntry *entry,
  {
-     /* Do not rearrange the CPUTLBEntry structure members. */
+     uint64_t a_zmask, b_val;
-     QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_read) !=
+     TCGCond cond;
--                      MMU_DATA_LOAD * TARGET_LONG_SIZE);
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
-+                      MMU_DATA_LOAD * sizeof(uint64_t));
+                 op->opc = xor_opc;
-     QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_write) !=
+                 op->args[2] = arg_new_constant(ctx, 1);
--                      MMU_DATA_STORE * TARGET_LONG_SIZE);
+             }
-+                      MMU_DATA_STORE * sizeof(uint64_t));
+-            return false;
-     QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_code) !=
++            return -1;
 -                      MMU_INST_FETCH * TARGET_LONG_SIZE);
 +                      MMU_INST_FETCH * sizeof(uint64_t));
 -    const target_ulong *ptr = &entry->addr_idx[access_type];
 -#if TCG_OVERSIZED_GUEST
 -    return *ptr;
 +#if TARGET_LONG_BITS == 32
 +    /* Use qatomic_read, in case of addr_write; only care about low bits. */
 +    const uint32_t *ptr = (uint32_t *)&entry->addr_idx[access_type];
 +    ptr += HOST_BIG_ENDIAN;
 +    return qatomic_read(ptr);
  #else
 +    const uint64_t *ptr = &entry->addr_idx[access_type];
 +# if TCG_OVERSIZED_GUEST
 +    return *ptr;
 +# else
      /* ofs might correspond to .addr_write, so use qatomic_read */
      return qatomic_read(ptr);
 +# endif
  #endif
  }
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
          addr &= TARGET_PAGE_MASK;
          addr += tlb_entry->addend;
          if ((addr - start) < length) {
 -#if TCG_OVERSIZED_GUEST
 +#if TARGET_LONG_BITS == 32
 +            uint32_t *ptr_write = (uint32_t *)&tlb_entry->addr_write;
 +            ptr_write += HOST_BIG_ENDIAN;
 +            qatomic_set(ptr_write, *ptr_write | TLB_NOTDIRTY);
 +#elif TCG_OVERSIZED_GUEST
              tlb_entry->addr_write |= TLB_NOTDIRTY;
  #else
              qatomic_set(&tlb_entry->addr_write,
 -                       tlb_entry->addr_write | TLB_NOTDIRTY);
 +                        tlb_entry->addr_write | TLB_NOTDIRTY);
  #endif
          }
      }
-diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
+-
-index XXXXXXX..XXXXXXX 100644
+-    return false;
---- a/tcg/aarch64/tcg-target.c.inc
++    return 0;
-+++ b/tcg/aarch64/tcg-target.c.inc
+ }
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
-     tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0);
+ static void fold_setcond_tst_pow2(OptContext *ctx, TCGOp *op, bool neg)
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
-     /* Load the tlb comparator into TMP0, and the fast path addend into TMP1. */
+         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
-+    QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
+     }
-     tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1,
-                is_ld ? offsetof(CPUTLBEntry, addr_read)
+-    if (fold_setcond_zmask(ctx, op, false)) {
-                      : offsetof(CPUTLBEntry, addr_write));
++    i = fold_setcond_zmask(ctx, op, false);
-diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
++    if (i > 0) {
-index XXXXXXX..XXXXXXX 100644
+         return true;
---- a/tcg/arm/tcg-target.c.inc
+     }
-+++ b/tcg/arm/tcg-target.c.inc
+-    fold_setcond_tst_pow2(ctx, op, false);
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
++    if (i == 0) {
-      * Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
++        fold_setcond_tst_pow2(ctx, op, false);
       * Load the tlb comparator into R2/R3 and the fast path addend into R1.
       */
 +    QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
      if (cmp_off == 0) {
          if (s->addr_type == TCG_TYPE_I32) {
              tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
 diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/loongarch64/tcg-target.c.inc
 +++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      tcg_out_opc_add_d(s, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1);
      /* Load the tlb comparator and the addend.  */
 +    QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
      tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2,
                 is_ld ? offsetof(CPUTLBEntry, addr_read)
                       : offsetof(CPUTLBEntry, addr_write));
 diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.c.inc
 +++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      /* Add the tlb_table pointer, creating the CPUTLBEntry address in TMP3.  */
      tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1);
 +    if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) {
 +        /* Load the (low half) tlb comparator.  */
 +        tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3,
 +                   cmp_off + HOST_BIG_ENDIAN * 4);
 +    } else {
 +        tcg_out_ld(s, TCG_TYPE_I64, TCG_TMP0, TCG_TMP3, cmp_off);
 +    }
-+
-     if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) {
+     ctx->z_mask = 1;
--        /* Load the tlb comparator.  */
+     return false;
--        tcg_out_ld(s, addr_type, TCG_TMP0, TCG_TMP3, cmp_off);
+@@ -XXX,XX +XXX,XX @@ static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
-         /* Load the tlb addend for the fast path.  */
+         return tcg_opt_gen_movi(ctx, op, op->args[0], -i);
          tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off);
 -    } else {
 -        /* Load the low half of the tlb comparator.  */
 -        tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
      }
-     /*
+-    if (fold_setcond_zmask(ctx, op, true)) {
-diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
++    i = fold_setcond_zmask(ctx, op, true);
-index XXXXXXX..XXXXXXX 100644
++    if (i > 0) {
---- a/tcg/ppc/tcg-target.c.inc
+         return true;
 +++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      }
-     tcg_out32(s, AND | SAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_R0));
+-    fold_setcond_tst_pow2(ctx, op, true);
++    if (i == 0) {
--    /* Load the (low part) TLB comparator into TMP2.  */
++        fold_setcond_tst_pow2(ctx, op, true);
--    if (cmp_off == 0
++    }
--        && (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32)) {
--        uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32
+     /* Value is {0,-1} so all bits are repetitions of the sign. */
--                        ? LWZUX : LDUX);
+     ctx->s_mask = -1;
 -        tcg_out32(s, lxu | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
 +    /*
 +     * Load the (low part) TLB comparator into TMP2.
 +     * For 64-bit host, always load the entire 64-bit slot for simplicity.
 +     * We will ignore the high bits with tcg_out_cmp(..., addr_type).
 +     */
 +    if (TCG_TARGET_REG_BITS == 64) {
 +        if (cmp_off == 0) {
 +            tcg_out32(s, LDUX | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
 +        } else {
 +            tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2));
 +            tcg_out_ld(s, TCG_TYPE_I64, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
 +        }
 +    } else if (cmp_off == 0 && !HOST_BIG_ENDIAN) {
 +        tcg_out32(s, LWZUX | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
      } else {
          tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2));
 -        if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) {
 -            tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2,
 -                       TCG_REG_TMP1, cmp_off + 4 * HOST_BIG_ENDIAN);
 -        } else {
 -            tcg_out_ld(s, addr_type, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
 -        }
 +        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2, TCG_REG_TMP1,
 +                   cmp_off + 4 * HOST_BIG_ENDIAN);
      }
      /*
 diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/riscv/tcg-target.c.inc
 +++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
      }
      /* Load the tlb comparator and the addend.  */
 +    QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
      tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2,
                 is_ld ? offsetof(CPUTLBEntry, addr_read)
                       : offsetof(CPUTLBEntry, addr_write));
 diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/s390x/tcg-target.c.inc
 +++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
          ofs = offsetof(CPUTLBEntry, addr_write);
      }
      if (addr_type == TCG_TYPE_I32) {
 +        ofs += HOST_BIG_ENDIAN * 4;
          tcg_out_insn(s, RX, C, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
      } else {
          tcg_out_insn(s, RXY, CG, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
 diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/sparc64/tcg-target.c.inc
 +++ b/tcg/sparc64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      /* Add the tlb_table pointer, creating the CPUTLBEntry address into R2.  */
      tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T3, ARITH_ADD);
 -    /* Load the tlb comparator and the addend. */
 -    tcg_out_ld(s, addr_type, TCG_REG_T2, TCG_REG_T1, cmp_off);
 +    /*
 +     * Load the tlb comparator and the addend.
 +     * Always load the entire 64-bit comparator for simplicity.
 +     * We will ignore the high bits via BPCC_ICC below.
 +     */
 +    tcg_out_ld(s, TCG_TYPE_I64, TCG_REG_T2, TCG_REG_T1, cmp_off);
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T1, TCG_REG_T1, add_off);
      h->base = TCG_REG_T1;
 --
-.34.1
+.43.0

-[PULL 49/52] exec/poison: Do not poison CONFIG_SOFTMMU
+[PULL 37/72] tcg/optimize: Use fold_masks_z in fold_setcond
-If CONFIG_USER_ONLY is ok generically, so is CONFIG_SOFTMMU,
+Avoid the use of the OptContext slots.
 because they are exactly opposite.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/poison.h         | 1 -
+ tcg/optimize.c | 3 +--
- scripts/make-config-poison.sh | 5 +++--
+file changed, 1 insertion(+), 2 deletions(-)
 files changed, 3 insertions(+), 3 deletions(-)
-diff --git a/include/exec/poison.h b/include/exec/poison.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/poison.h
+--- a/tcg/optimize.c
-+++ b/include/exec/poison.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
- #pragma GCC poison CONFIG_HVF
+         fold_setcond_tst_pow2(ctx, op, false);
- #pragma GCC poison CONFIG_LINUX_USER
+     }
- #pragma GCC poison CONFIG_KVM
--#pragma GCC poison CONFIG_SOFTMMU
+-    ctx->z_mask = 1;
- #pragma GCC poison CONFIG_WHPX
+-    return false;
- #pragma GCC poison CONFIG_XEN
++    return fold_masks_z(ctx, op, 1);
+ }
-diff --git a/scripts/make-config-poison.sh b/scripts/make-config-poison.sh
-index XXXXXXX..XXXXXXX 100755
+ static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
 --- a/scripts/make-config-poison.sh
 +++ b/scripts/make-config-poison.sh
@@ -XXX,XX +XXX,XX @@ if test $# = 0; then
    exit 0
  fi
 -# Create list of config switches that should be poisoned in common code...
 -# but filter out CONFIG_TCG and CONFIG_USER_ONLY which are special.
 +# Create list of config switches that should be poisoned in common code,
 +# but filter out several which are handled manually.
  exec sed -n \
    -e' /CONFIG_TCG/d' \
    -e '/CONFIG_USER_ONLY/d' \
 +  -e '/CONFIG_SOFTMMU/d' \
    -e '/^#define / {' \
    -e    's///' \
    -e    's/ .*//' \
 --
-.34.1
+.43.0

-[PULL 48/52] plugins: Drop unused headers from exec/plugin-gen.h
+[PULL 38/72] tcg/optimize: Use fold_masks_s in fold_negsetcond
-Two headers are not required for the rest of the
+Avoid the use of the OptContext slots.
 contents of plugin-gen.h.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/plugin-gen.h | 2 --
+ tcg/optimize.c | 3 +--
-file changed, 2 deletions(-)
+file changed, 1 insertion(+), 2 deletions(-)
-diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/plugin-gen.h
+--- a/tcg/optimize.c
-+++ b/include/exec/plugin-gen.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
- #ifndef QEMU_PLUGIN_GEN_H
+     }
- #define QEMU_PLUGIN_GEN_H
+     /* Value is {0,-1} so all bits are repetitions of the sign. */
--#include "exec/cpu_ldst.h"
+-    ctx->s_mask = -1;
--#include "qemu/plugin.h"
+-    return false;
- #include "tcg/tcg.h"
++    return fold_masks_s(ctx, op, -1);
+ }
- struct DisasContextBase;
  static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 --
-.34.1
+.43.0

-[PULL 44/52] tcg: Fix PAGE/PROT confusion
+[PULL 39/72] tcg/optimize: Use fold_masks_z in fold_setcond2
-The bug was hidden because they happen to have the same values.
+Avoid the use of the OptContext slots.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/region.c | 18 +++++++++++++-----
+ tcg/optimize.c | 3 +--
-file changed, 13 insertions(+), 5 deletions(-)
+file changed, 1 insertion(+), 2 deletions(-)
-diff --git a/tcg/region.c b/tcg/region.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/region.c
+--- a/tcg/optimize.c
-+++ b/tcg/region.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static int alloc_code_gen_buffer(size_t tb_size, int splitwx, Error **errp)
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
-     return PROT_READ | PROT_WRITE;
+         return fold_setcond(ctx, op);
  }
  #elif defined(_WIN32)
 +/*
 + * Local source-level compatibility with Unix.
 + * Used by tcg_region_init below.
 + */
 +#define PROT_READ   1
 +#define PROT_WRITE  2
 +#define PROT_EXEC   4
 +
  static int alloc_code_gen_buffer(size_t size, int splitwx, Error **errp)
  {
      void *buf;
@@ -XXX,XX +XXX,XX @@ static int alloc_code_gen_buffer(size_t size, int splitwx, Error **errp)
      region.start_aligned = buf;
      region.total_size = size;
 -    return PAGE_READ | PAGE_WRITE | PAGE_EXEC;
 +    return PROT_READ | PROT_WRITE | PROT_EXEC;
  }
  #else
  static int alloc_code_gen_buffer_anon(size_t size, int prot,
@@ -XXX,XX +XXX,XX @@ void tcg_region_init(size_t tb_size, int splitwx, unsigned max_cpus)
       * buffer -- let that one use hugepages throughout.
       * Work with the page protections set up with the initial mapping.
       */
 -    need_prot = PAGE_READ | PAGE_WRITE;
 +    need_prot = PROT_READ | PROT_WRITE;
  #ifndef CONFIG_TCG_INTERPRETER
      if (tcg_splitwx_diff == 0) {
 -        need_prot |= PAGE_EXEC;
 +        need_prot |= PROT_EXEC;
      }
- #endif
-     for (size_t i = 0, n = region.n; i < n; i++) {
+-    ctx->z_mask = 1;
-@@ -XXX,XX +XXX,XX @@ void tcg_region_init(size_t tb_size, int splitwx, unsigned max_cpus)
+-    return false;
-         if (have_prot != need_prot) {
++    return fold_masks_z(ctx, op, 1);
-             int rc;
+  do_setcond_const:
--            if (need_prot == (PAGE_READ | PAGE_WRITE | PAGE_EXEC)) {
+     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +            if (need_prot == (PROT_READ | PROT_WRITE | PROT_EXEC)) {
                  rc = qemu_mprotect_rwx(start, end - start);
 -            } else if (need_prot == (PAGE_READ | PAGE_WRITE)) {
 +            } else if (need_prot == (PROT_READ | PROT_WRITE)) {
                  rc = qemu_mprotect_rw(start, end - start);
              } else {
                  g_assert_not_reached();
 --
-.34.1
+.43.0

-[PULL 42/52] target/arm: Add missing include of exec/exec-all.h
+[PULL 40/72] tcg/optimize: Use finish_folding in fold_cmp_vec
-This had been pulled in via exec/translator.h,
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 but the include of exec-all.h will be removed.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/arm/tcg/translate.h | 1 +
+ tcg/optimize.c | 2 +-
-file changed, 1 insertion(+)
+file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/tcg/translate.h
+--- a/tcg/optimize.c
-+++ b/target/arm/tcg/translate.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_cmp_vec(OptContext *ctx, TCGOp *op)
- #include "cpu.h"
+     if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
- #include "tcg/tcg-op.h"
+         op->args[3] = tcg_swap_cond(op->args[3]);
- #include "tcg/tcg-op-gvec.h"
+     }
-+#include "exec/exec-all.h"
+-    return false;
- #include "exec/translator.h"
++    return finish_folding(ctx, op);
- #include "exec/helper-gen.h"
+ }
- #include "internals.h"
  static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
 --
-.34.1
+.43.0

-[PULL 41/52] target/*: Add missing includes of exec/translation-block.h
+[PULL 41/72] tcg/optimize: Use finish_folding in fold_cmpsel_vec
-This had been pulled in via exec/exec-all.h, via exec/translator.h,
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 but the include of exec-all.h will be removed.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/hexagon/translate.c   | 1 +
+ tcg/optimize.c | 2 +-
- target/loongarch/translate.c | 3 +--
+file changed, 1 insertion(+), 1 deletion(-)
  target/mips/tcg/translate.c  | 1 +
 files changed, 3 insertions(+), 2 deletions(-)
-diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/hexagon/translate.c
+--- a/tcg/optimize.c
-+++ b/target/hexagon/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
- #include "tcg/tcg-op-gvec.h"
+     if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
- #include "exec/helper-gen.h"
+         op->args[5] = tcg_invert_cond(op->args[5]);
- #include "exec/helper-proto.h"
+     }
-+#include "exec/translation-block.h"
+-    return false;
- #include "exec/cpu_ldst.h"
++    return finish_folding(ctx, op);
- #include "exec/log.h"
+ }
- #include "internal.h"
-diff --git a/target/loongarch/translate.c b/target/loongarch/translate.c
+ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 index XXXXXXX..XXXXXXX 100644
 --- a/target/loongarch/translate.c
 +++ b/target/loongarch/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "cpu.h"
  #include "tcg/tcg-op.h"
  #include "tcg/tcg-op-gvec.h"
 -
 +#include "exec/translation-block.h"
  #include "exec/translator.h"
  #include "exec/helper-proto.h"
  #include "exec/helper-gen.h"
 -
  #include "exec/log.h"
  #include "qemu/qemu-print.h"
  #include "fpu/softfloat.h"
 diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/tcg/translate.c
 +++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "translate.h"
  #include "internal.h"
  #include "exec/helper-proto.h"
 +#include "exec/translation-block.h"
  #include "semihosting/semihost.h"
  #include "trace.h"
  #include "disas/disas.h"
 --
-.34.1
+.43.0

-[PULL 02/52] tcg/riscv: Remove TARGET_LONG_BITS, TCG_TYPE_TL
+[PULL 42/72] tcg/optimize: Use fold_masks_zs in fold_sextract
-All uses replaced with TCGContext.addr_type.
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/riscv/tcg-target.c.inc | 13 +++++++------
+ tcg/optimize.c | 24 +++++++++---------------
-file changed, 7 insertions(+), 6 deletions(-)
+file changed, 9 insertions(+), 15 deletions(-)
-diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/riscv/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/riscv/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
+@@ -XXX,XX +XXX,XX @@ static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
-                                            TCGReg addr_reg, MemOpIdx oi,
+ static bool fold_sextract(OptContext *ctx, TCGOp *op)
                                             bool is_ld)
  {
-+    TCGType addr_type = s->addr_type;
+     uint64_t z_mask, s_mask, s_mask_old;
-     TCGLabelQemuLdst *ldst = NULL;
++    TempOptInfo *t1 = arg_info(op->args[1]);
-     MemOp opc = get_memop(oi);
+     int pos = op->args[2];
-     TCGAtomAlign aa;
+     int len = op->args[3];
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
-     addr_adj = addr_reg;
+-    if (arg_is_const(op->args[1])) {
-     if (a_mask < s_mask) {
+-        uint64_t t;
-         addr_adj = TCG_REG_TMP0;
+-
--        tcg_out_opc_imm(s, TARGET_LONG_BITS == 32 ? OPC_ADDIW : OPC_ADDI,
+-        t = arg_info(op->args[1])->val;
-+        tcg_out_opc_imm(s, addr_type == TCG_TYPE_I32 ? OPC_ADDIW : OPC_ADDI,
+-        t = sextract64(t, pos, len);
-                         addr_adj, addr_reg, s_mask - a_mask);
+-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    if (ti_is_const(t1)) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0],
 +                                sextract64(ti_const_val(t1), pos, len));
      }
-     compare_mask = s->page_mask | a_mask;
-     if (compare_mask == sextreg(compare_mask, 0, 12)) {
+-    z_mask = arg_info(op->args[1])->z_mask;
-         tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_adj, compare_mask);
+-    z_mask = sextract64(z_mask, pos, len);
-     } else {
+-    ctx->z_mask = z_mask;
--        tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask);
+-
-+        tcg_out_movi(s, addr_type, TCG_REG_TMP1, compare_mask);
+-    s_mask_old = arg_info(op->args[1])->s_mask;
-         tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr_adj);
+-    s_mask = sextract64(s_mask_old, pos, len);
 -    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
 -    ctx->s_mask = s_mask;
 +    s_mask_old = t1->s_mask;
 +    s_mask = s_mask_old >> pos;
 +    s_mask |= -1ull << (len - 1);
      if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
          return true;
      }
-     /* Load the tlb comparator and the addend.  */
+-    return fold_masks(ctx, op);
--    tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP0, TCG_REG_TMP2,
++    z_mask = sextract64(t1->z_mask, pos, len);
-+    tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2,
++    return fold_masks_zs(ctx, op, z_mask, s_mask);
-                is_ld ? offsetof(CPUTLBEntry, addr_read)
+ }
-                      : offsetof(CPUTLBEntry, addr_write));
-     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2,
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
      tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0);
      /* TLB Hit - translate address using addend.  */
 -    if (TARGET_LONG_BITS == 64) {
 +    if (addr_type != TCG_TYPE_I32) {
          tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, addr_reg, TCG_REG_TMP2);
      } else if (have_zba) {
          tcg_out_opc_reg(s, OPC_ADD_UW, TCG_REG_TMP0, addr_reg, TCG_REG_TMP2);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
      if (guest_base != 0) {
          base = TCG_REG_TMP0;
 -        if (TARGET_LONG_BITS == 64) {
 +        if (addr_type != TCG_TYPE_I32) {
              tcg_out_opc_reg(s, OPC_ADD, base, addr_reg, TCG_GUEST_BASE_REG);
          } else if (have_zba) {
              tcg_out_opc_reg(s, OPC_ADD_UW, base, addr_reg, TCG_GUEST_BASE_REG);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
              tcg_out_ext32u(s, base, addr_reg);
              tcg_out_opc_reg(s, OPC_ADD, base, base, TCG_GUEST_BASE_REG);
          }
 -    } else if (TARGET_LONG_BITS == 64) {
 +    } else if (addr_type != TCG_TYPE_I32) {
          base = addr_reg;
      } else {
          base = TCG_REG_TMP0;
 --
-.34.1
+.43.0

-[PULL 03/52] tcg/s390x: Remove TARGET_LONG_BITS, TCG_TYPE_TL
+[PULL 43/72] tcg/optimize: Use fold_masks_zs, fold_masks_s in fold_shift
-All uses replaced with TCGContext.addr_type.
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/s390x/tcg-target.c.inc | 9 +++++----
+ tcg/optimize.c | 27 ++++++++++++++-------------
-file changed, 5 insertions(+), 4 deletions(-)
+file changed, 14 insertions(+), 13 deletions(-)
-diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/s390x/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/s390x/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
-                                            TCGReg addr_reg, MemOpIdx oi,
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
                                             bool is_ld)
  {
-+    TCGType addr_type = s->addr_type;
+     uint64_t s_mask, z_mask, sign;
-     TCGLabelQemuLdst *ldst = NULL;
++    TempOptInfo *t1, *t2;
-     MemOp opc = get_memop(oi);
-     MemOp s_bits = opc & MO_SIZE;
+     if (fold_const2(ctx, op) ||
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+         fold_ix_to_i(ctx, op, 0) ||
-         tgen_andi_risbg(s, TCG_REG_R0, addr_reg, tlb_mask);
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-     } else {
+         return true;
          tcg_out_insn(s, RX, LA, TCG_REG_R0, addr_reg, TCG_REG_NONE, a_off);
 -        tgen_andi(s, TCG_TYPE_TL, TCG_REG_R0, tlb_mask);
 +        tgen_andi(s, addr_type, TCG_REG_R0, tlb_mask);
      }
-     if (is_ld) {
+-    s_mask = arg_info(op->args[1])->s_mask;
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+-    z_mask = arg_info(op->args[1])->z_mask;
-     } else {
++    t1 = arg_info(op->args[1]);
-         ofs = offsetof(CPUTLBEntry, addr_write);
++    t2 = arg_info(op->args[2]);
 +    s_mask = t1->s_mask;
 +    z_mask = t1->z_mask;
 -    if (arg_is_const(op->args[2])) {
 -        int sh = arg_info(op->args[2])->val;
 -
 -        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
 +    if (ti_is_const(t2)) {
 +        int sh = ti_const_val(t2);
 +        z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
          s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
 -        return fold_masks(ctx, op);
 +        return fold_masks_zs(ctx, op, z_mask, s_mask);
      }
--    if (TARGET_LONG_BITS == 32) {
-+    if (addr_type == TCG_TYPE_I32) {
+     switch (op->opc) {
-         tcg_out_insn(s, RX, C, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-     } else {
+          * Arithmetic right shift will not reduce the number of
-         tcg_out_insn(s, RXY, CG, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
+          * input sign repetitions.
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+          */
-     tcg_out_insn(s, RXY, LG, h->index, TCG_TMP0, TCG_REG_NONE,
+-        ctx->s_mask = s_mask;
-                  offsetof(CPUTLBEntry, addend));
+-        break;
++        return fold_masks_s(ctx, op, s_mask);
--    if (TARGET_LONG_BITS == 32) {
+     CASE_OP_32_64(shr):
-+    if (addr_type == TCG_TYPE_I32) {
+         /*
-         tcg_out_insn(s, RRE, ALGFR, h->index, addr_reg);
+          * If the sign bit is known zero, then logical right shift
-         h->base = TCG_REG_NONE;
+-         * will not reduced the number of input sign repetitions.
-     } else {
++         * will not reduce the number of input sign repetitions.
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+          */
 -        sign = (s_mask & -s_mask) >> 1;
 +        sign = -s_mask;
          if (sign && !(z_mask & sign)) {
 -            ctx->s_mask = s_mask;
 +            return fold_masks_s(ctx, op, s_mask);
          }
          break;
      default:
          break;
      }
-     h->base = addr_reg;
+-    return false;
--    if (TARGET_LONG_BITS == 32) {
++    return finish_folding(ctx, op);
-+    if (addr_type == TCG_TYPE_I32) {
+ }
-         tcg_out_ext32u(s, TCG_TMP0, addr_reg);
-         h->base = TCG_TMP0;
+ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
      }
 --
-.34.1
+.43.0

-[PULL 39/52] target/arm: Tidy helpers for translation
+[PULL 44/72] tcg/optimize: Simplify sign bit test in fold_shift
-Move most includes from *translate*.c to translate.h, ensuring
+Merge the two conditions, sign != 0 && !(z_mask & sign),
-that we get the ordering correct.  Ensure cpu.h is first.
+by testing ~z_mask & sign.   If sign == 0, the logical and
-Use disas/disas.h instead of exec/log.h.
+will produce false.
 Drop otherwise unused includes.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/arm/tcg/translate.h        |  3 +++
+ tcg/optimize.c | 5 ++---
- target/arm/tcg/translate-a64.c    | 17 +++++------------
+file changed, 2 insertions(+), 3 deletions(-)
  target/arm/tcg/translate-m-nocp.c |  2 --
  target/arm/tcg/translate-mve.c    |  3 ---
  target/arm/tcg/translate-neon.c   |  3 ---
  target/arm/tcg/translate-sme.c    |  6 ------
  target/arm/tcg/translate-sve.c    |  9 ---------
  target/arm/tcg/translate-vfp.c    |  3 ---
  target/arm/tcg/translate.c        | 17 +++++------------
 files changed, 13 insertions(+), 50 deletions(-)
-diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/tcg/translate.h
+--- a/tcg/optimize.c
-+++ b/target/arm/tcg/translate.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
- #ifndef TARGET_ARM_TRANSLATE_H
- #define TARGET_ARM_TRANSLATE_H
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
-+#include "cpu.h"
+-    uint64_t s_mask, z_mask, sign;
-+#include "tcg/tcg-op.h"
++    uint64_t s_mask, z_mask;
-+#include "tcg/tcg-op-gvec.h"
+     TempOptInfo *t1, *t2;
- #include "exec/translator.h"
- #include "exec/helper-gen.h"
+     if (fold_const2(ctx, op) ||
- #include "internals.h"
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
+          * If the sign bit is known zero, then logical right shift
-index XXXXXXX..XXXXXXX 100644
+          * will not reduce the number of input sign repetitions.
---- a/target/arm/tcg/translate-a64.c
+          */
-+++ b/target/arm/tcg/translate-a64.c
+-        sign = -s_mask;
-@@ -XXX,XX +XXX,XX @@
+-        if (sign && !(z_mask & sign)) {
-  */
++        if (~z_mask & -s_mask) {
- #include "qemu/osdep.h"
+             return fold_masks_s(ctx, op, s_mask);
+         }
--#include "cpu.h"
+         break;
 -#include "exec/exec-all.h"
 -#include "tcg/tcg-op.h"
 -#include "tcg/tcg-op-gvec.h"
 -#include "qemu/log.h"
 -#include "arm_ldst.h"
  #include "translate.h"
 -#include "internals.h"
 -#include "qemu/host-utils.h"
 -#include "semihosting/semihost.h"
 -#include "exec/log.h"
 -#include "cpregs.h"
  #include "translate-a64.h"
 -#include "qemu/atomic128.h"
 +#include "qemu/log.h"
 +#include "disas/disas.h"
 +#include "arm_ldst.h"
 +#include "semihosting/semihost.h"
 +#include "cpregs.h"
  static TCGv_i64 cpu_X[32];
  static TCGv_i64 cpu_pc;
 diff --git a/target/arm/tcg/translate-m-nocp.c b/target/arm/tcg/translate-m-nocp.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-m-nocp.c
 +++ b/target/arm/tcg/translate-m-nocp.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "tcg/tcg-op.h"
 -#include "tcg/tcg-op-gvec.h"
  #include "translate.h"
  #include "translate-a32.h"
 diff --git a/target/arm/tcg/translate-mve.c b/target/arm/tcg/translate-mve.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-mve.c
 +++ b/target/arm/tcg/translate-mve.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "tcg/tcg-op.h"
 -#include "tcg/tcg-op-gvec.h"
 -#include "exec/exec-all.h"
  #include "translate.h"
  #include "translate-a32.h"
 diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-neon.c
 +++ b/target/arm/tcg/translate-neon.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "tcg/tcg-op.h"
 -#include "tcg/tcg-op-gvec.h"
 -#include "exec/exec-all.h"
  #include "translate.h"
  #include "translate-a32.h"
 diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-sme.c
 +++ b/target/arm/tcg/translate-sme.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "cpu.h"
 -#include "tcg/tcg-op.h"
 -#include "tcg/tcg-op-gvec.h"
 -#include "tcg/tcg-gvec-desc.h"
  #include "translate.h"
  #include "translate-a64.h"
 -#include "fpu/softfloat.h"
 -
  /*
   * Include the generated decoder.
 diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-sve.c
 +++ b/target/arm/tcg/translate-sve.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "cpu.h"
 -#include "exec/exec-all.h"
 -#include "tcg/tcg-op.h"
 -#include "tcg/tcg-op-gvec.h"
 -#include "tcg/tcg-gvec-desc.h"
 -#include "qemu/log.h"
 -#include "arm_ldst.h"
  #include "translate.h"
 -#include "internals.h"
 -#include "exec/log.h"
  #include "translate-a64.h"
  #include "fpu/softfloat.h"
 diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate-vfp.c
 +++ b/target/arm/tcg/translate-vfp.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "tcg/tcg-op.h"
 -#include "tcg/tcg-op-gvec.h"
 -#include "exec/exec-all.h"
  #include "translate.h"
  #include "translate-a32.h"
 diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate.c
 +++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "cpu.h"
 -#include "internals.h"
 -#include "disas/disas.h"
 -#include "exec/exec-all.h"
 -#include "tcg/tcg-op.h"
 -#include "tcg/tcg-op-gvec.h"
 -#include "qemu/log.h"
 -#include "qemu/bitops.h"
 -#include "arm_ldst.h"
 -#include "semihosting/semihost.h"
 -#include "exec/log.h"
 -#include "cpregs.h"
  #include "translate.h"
  #include "translate-a32.h"
 +#include "qemu/log.h"
 +#include "disas/disas.h"
 +#include "arm_ldst.h"
 +#include "semihosting/semihost.h"
 +#include "cpregs.h"
  #include "exec/helper-proto.h"
  #define HELPER_H "helper.h"
 --
-.34.1
+.43.0

-[PULL 51/52] accel/tcg: Unmap perf_marker
+[PULL 45/72] tcg/optimize: Use finish_folding in fold_sub, fold_sub_vec
-From: Ilya Leoshkevich <iii@linux.ibm.com>
+Duplicate fold_sub_vec into fold_sub instead of calling it,
 now that fold_sub_vec always returns true.
-Coverity complains that perf_marker is never unmapped.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Fix by unmapping it in perf_exit().
 Fixes: Coverity CID 1507929
 Fixes: 5584e2dbe8c9 ("tcg: add perfmap and jitdump")
 Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
 Message-Id: <20230605114134.1169974-1-iii@linux.ibm.com>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/perf.c | 11 +++++++++--
+ tcg/optimize.c | 9 ++++++---
-file changed, 9 insertions(+), 2 deletions(-)
+file changed, 6 insertions(+), 3 deletions(-)
-diff --git a/accel/tcg/perf.c b/accel/tcg/perf.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/perf.c
+--- a/tcg/optimize.c
-+++ b/accel/tcg/perf.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void write_perfmap_entry(const void *start, size_t insn,
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub_vec(OptContext *ctx, TCGOp *op)
          fold_sub_to_neg(ctx, op)) {
          return true;
      }
 -    return false;
 +    return finish_folding(ctx, op);
  }
- static FILE *jitdump;
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
 +static size_t perf_marker_size;
 +static void *perf_marker = MAP_FAILED;
  #define JITHEADER_MAGIC 0x4A695444
  #define JITHEADER_VERSION 1
@@ -XXX,XX +XXX,XX @@ void perf_enable_jitdump(void)
  {
-     struct jitheader header;
+-    if (fold_const2(ctx, op) || fold_sub_vec(ctx, op)) {
-     char jitdump_file[32];
++    if (fold_const2(ctx, op) ||
--    void *perf_marker;
++        fold_xx_to_i(ctx, op, 0) ||
++        fold_xi_to_x(ctx, op, 0) ||
-     if (!use_rt_clock) {
++        fold_sub_to_neg(ctx, op)) {
-         warn_report("CLOCK_MONOTONIC is not available, proceeding without jitdump");
+         return true;
@@ -XXX,XX +XXX,XX @@ void perf_enable_jitdump(void)
       * PERF_RECORD_MMAP or PERF_RECORD_MMAP2 event is of the form jit-%d.dump
       * and will process it as a jitdump file.
       */
 -    perf_marker = mmap(NULL, qemu_real_host_page_size(), PROT_READ | PROT_EXEC,
 +    perf_marker_size = qemu_real_host_page_size();
 +    perf_marker = mmap(NULL, perf_marker_size, PROT_READ | PROT_EXEC,
                         MAP_PRIVATE, fileno(jitdump), 0);
      if (perf_marker == MAP_FAILED) {
          warn_report("Could not map %s: %s, proceeding without jitdump",
@@ -XXX,XX +XXX,XX @@ void perf_exit(void)
          perfmap = NULL;
      }
-+    if (perf_marker != MAP_FAILED) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
-+        munmap(perf_marker, perf_marker_size);
+                    ? INDEX_op_add_i32 : INDEX_op_add_i64);
-+        perf_marker = MAP_FAILED;
+         op->args[2] = arg_new_constant(ctx, -val);
-+    }
+     }
-+
+-    return false;
-     if (jitdump) {
++    return finish_folding(ctx, op);
-         fclose(jitdump);
+ }
-         jitdump = NULL;
  static bool fold_sub2(OptContext *ctx, TCGOp *op)
 --
-.34.1
+.43.0

-[PULL 30/52] tcg: Remove NO_CPU_IO_DEFS
+[PULL 46/72] tcg/optimize: Use fold_masks_zs in fold_tcg_ld
-From this remove, it's no longer clear what this is attempting
+Avoid the use of the OptContext slots.
 to protect.  The last time a use of this define was added to
 the source tree, as opposed to merely moved around, was 2008.
 There have been many cleanups since that time and this is
 no longer required for the build to succeed.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/ppc/cpu.h          | 2 --
+ tcg/optimize.c | 16 +++++++++-------
- target/sparc/cpu.h        | 2 --
+file changed, 9 insertions(+), 7 deletions(-)
  accel/tcg/translate-all.c | 1 -
  tcg/tcg.c                 | 6 ------
 files changed, 11 deletions(-)
-diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/ppc/cpu.h
+--- a/tcg/optimize.c
-+++ b/target/ppc/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void ppc_store_msr(CPUPPCState *env, target_ulong value);
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub2(OptContext *ctx, TCGOp *op)
- void ppc_cpu_list(void);
+ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
- /* Time-base and decrementer management */
+ {
--#ifndef NO_CPU_IO_DEFS
++    uint64_t z_mask = -1, s_mask = 0;
- uint64_t cpu_ppc_load_tbl(CPUPPCState *env);
++
- uint32_t cpu_ppc_load_tbu(CPUPPCState *env);
+     /* We can't do any folding with a load, but we can record bits. */
- void cpu_ppc_store_tbu(CPUPPCState *env, uint32_t value);
+     switch (op->opc) {
-@@ -XXX,XX +XXX,XX @@ int ppcemb_tlb_check(CPUPPCState *env, ppcemb_tlb_t *tlb,
+     CASE_OP_32_64(ld8s):
- hwaddr booke206_tlb_to_page_size(CPUPPCState *env,
+-        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
-                                         ppcmas_tlb_t *tlb);
++        s_mask = INT8_MIN;
- #endif
+         break;
--#endif
+     CASE_OP_32_64(ld8u):
+-        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
- void ppc_store_fpscr(CPUPPCState *env, target_ulong val);
++        z_mask = MAKE_64BIT_MASK(0, 8);
- void helper_hfscr_facility_check(CPUPPCState *env, uint32_t bit,
+         break;
-diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
+     CASE_OP_32_64(ld16s):
-index XXXXXXX..XXXXXXX 100644
+-        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
---- a/target/sparc/cpu.h
++        s_mask = INT16_MIN;
-+++ b/target/sparc/cpu.h
+         break;
-@@ -XXX,XX +XXX,XX @@ G_NORETURN void sparc_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
+     CASE_OP_32_64(ld16u):
-                                               uintptr_t retaddr);
+-        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
- G_NORETURN void cpu_raise_exception_ra(CPUSPARCState *, int, uintptr_t);
++        z_mask = MAKE_64BIT_MASK(0, 16);
+         break;
--#ifndef NO_CPU_IO_DEFS
+     case INDEX_op_ld32s_i64:
- /* cpu_init.c */
+-        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
- void cpu_sparc_set_id(CPUSPARCState *env, unsigned int cpu);
++        s_mask = INT32_MIN;
- void sparc_cpu_list(void);
+         break;
-@@ -XXX,XX +XXX,XX @@ static inline int tlb_compare_context(const SparcTLBEntry *tlb,
+     case INDEX_op_ld32u_i64:
-     return compare_masked(context, tlb->tag, MMU_CONTEXT_MASK);
+-        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        z_mask = MAKE_64BIT_MASK(0, 32);
          break;
      default:
          g_assert_not_reached();
      }
 -    return false;
 +    return fold_masks_zs(ctx, op, z_mask, s_mask);
  }
--#endif
+ static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op)
  #endif
  /* cpu-exec.c */
 diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
 -#define NO_CPU_IO_DEFS
  #include "trace.h"
  #include "disas/disas.h"
  #include "exec/exec-all.h"
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/cacheflush.h"
  #include "qemu/cacheinfo.h"
  #include "qemu/timer.h"
 -
 -/* Note: the long term plan is to reduce the dependencies on the QEMU
 -   CPU definitions. Currently they are used for qemu_ld/st
 -   instructions */
 -#define NO_CPU_IO_DEFS
 -
  #include "exec/exec-all.h"
  #include "exec/tlb-common.h"
  #include "tcg/tcg-op-common.h"
 --
-.34.1
+.43.0

-[PULL 34/52] include/exec: Remove CODE_GEN_AVG_BLOCK_SIZE
+[PULL 47/72] tcg/optimize: Use finish_folding in fold_tcg_ld_memcopy
-The last use was removed with 2ac01d6dafab.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Fixes: 2ac01d6dafab ("translate-all: use a binary search tree to track TBs in TBContext")
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h | 10 ----------
+ tcg/optimize.c | 2 +-
-file changed, 10 deletions(-)
+file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ int probe_access_full(CPUArchState *env, target_ulong addr, int size,
+@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op)
-                       CPUTLBEntryFull **pfull, uintptr_t retaddr);
+     TCGType type;
- #endif
+     if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
--/* Estimated block size for TB allocation.  */
+-        return false;
--/* ??? The following is based on a 2015 survey of x86_64 host output.
++        return finish_folding(ctx, op);
--   Better would seem to be some sort of dynamically sized TB array,
+     }
--   adapting to the block sizes actually being produced.  */
--#if defined(CONFIG_SOFTMMU)
+     type = ctx->type;
 -#define CODE_GEN_AVG_BLOCK_SIZE 400
 -#else
 -#define CODE_GEN_AVG_BLOCK_SIZE 150
 -#endif
 -
  /* Hide the qatomic_read to make code a little easier on the eyes */
  static inline uint32_t tb_cflags(const TranslationBlock *tb)
  {
 --
-.34.1
+.43.0

-[PULL 52/52] tcg/tcg-op-vec: Remove left over _link_error() definitions
+[PULL 48/72] tcg/optimize: Use fold_masks_zs in fold_xor
-From: Philippe Mathieu-Daudé <philmd@linaro.org>
+Avoid the use of the OptContext slots.  Find TempOptInfo once.
 Remove fold_masks as the function becomes unused.
-In commit d56fea79f9 ("tcg: Move TCG_{LOW,HIGH} to tcg-internal.h")
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 we replaced the "_link_error" definitions with modern QEMU_ERROR()
 attribute markup. We covered tcg-op.c but forgot to completely
 clean tcg-op-vec.c. Do it now.
 Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Message-Id: <20230605175647.88395-3-philmd@linaro.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg-op-vec.c | 11 -----------
+ tcg/optimize.c | 18 ++++++++----------
-file changed, 11 deletions(-)
+file changed, 8 insertions(+), 10 deletions(-)
-diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg-op-vec.c
+--- a/tcg/optimize.c
-+++ b/tcg/tcg-op-vec.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks_s(OptContext *ctx, TCGOp *op, uint64_t s_mask)
- #include "tcg/tcg-mo.h"
+     return fold_masks_zs(ctx, op, -1, s_mask);
- #include "tcg-internal.h"
+ }
--
+-static bool fold_masks(OptContext *ctx, TCGOp *op)
--/* Reduce the number of ifdefs below.  This assumes that all uses of
+-{
--   TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
+-    return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
--   the compiler can eliminate.  */
+-}
 -#if TCG_TARGET_REG_BITS == 64
 -extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64);
 -extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
 -#define TCGV_LOW  TCGV_LOW_link_error
 -#define TCGV_HIGH TCGV_HIGH_link_error
 -#endif
 -
  /*
-  * Vector optional opcode tracking.
+  * An "affected" mask bit is 0 if and only if the result is identical
-  * Except for the basic logical operations (and, or, xor), and
+  * to the first input.  Thus if the entire mask is 0, the operation
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask, s_mask;
 +    TempOptInfo *t1, *t2;
 +
      if (fold_const2_commutative(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
          return true;
      }
 -    ctx->z_mask = arg_info(op->args[1])->z_mask
 -                | arg_info(op->args[2])->z_mask;
 -    ctx->s_mask = arg_info(op->args[1])->s_mask
 -                & arg_info(op->args[2])->s_mask;
 -    return fold_masks(ctx, op);
 +    t1 = arg_info(op->args[1]);
 +    t2 = arg_info(op->args[2]);
 +    z_mask = t1->z_mask | t2->z_mask;
 +    s_mask = t1->s_mask & t2->s_mask;
 +    return fold_masks_zs(ctx, op, z_mask, s_mask);
  }
  static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
 --
-.34.1
+.43.0

-[PULL 26/52] tcg: Add insn_start_words to TCGContext
+[PULL 49/72] tcg/optimize: Use finish_folding in fold_bitsel_vec
-This will enable replacement of TARGET_INSN_START_WORDS in tcg.c.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Split out "tcg/insn-start-words.h" and use it in target/.
 Reviewed-by: Anton Johansson <anjo@rev.ng>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/insn-start-words.h | 17 +++++++++++++++++
+ tcg/optimize.c | 2 +-
- include/tcg/tcg-op.h           |  8 ++++----
+file changed, 1 insertion(+), 1 deletion(-)
  include/tcg/tcg-opc.h          |  6 +++---
  include/tcg/tcg.h              |  9 ++-------
  accel/tcg/perf.c               |  8 ++++++--
  accel/tcg/translate-all.c      | 20 +++++++++++++-------
  target/i386/helper.c           |  2 +-
  target/openrisc/sys_helper.c   |  2 +-
  tcg/tcg.c                      | 16 +++++++++++-----
 files changed, 58 insertions(+), 30 deletions(-)
  create mode 100644 include/tcg/insn-start-words.h
-diff --git a/include/tcg/insn-start-words.h b/include/tcg/insn-start-words.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/tcg/insn-start-words.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define TARGET_INSN_START_WORDS
 + * Copyright (c) 2008 Fabrice Bellard
 + */
 +
 +#ifndef TARGET_INSN_START_WORDS
 +
 +#include "cpu.h"
 +
 +#ifndef TARGET_INSN_START_EXTRA_WORDS
 +# define TARGET_INSN_START_WORDS 1
 +#else
 +# define TARGET_INSN_START_WORDS (1 + TARGET_INSN_START_EXTRA_WORDS)
 +#endif
 +
 +#endif /* TARGET_INSN_START_WORDS */
 diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg-op.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
- # error
+             return fold_orc(ctx, op);
  #endif
 -#if TARGET_INSN_START_WORDS == 1
 +#ifndef TARGET_INSN_START_EXTRA_WORDS
  static inline void tcg_gen_insn_start(target_ulong pc)
  {
      TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 64 / TCG_TARGET_REG_BITS);
      tcg_set_insn_start_param(op, 0, pc);
  }
 -#elif TARGET_INSN_START_WORDS == 2
 +#elif TARGET_INSN_START_EXTRA_WORDS == 1
  static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1)
  {
      TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 2 * 64 / TCG_TARGET_REG_BITS);
      tcg_set_insn_start_param(op, 0, pc);
      tcg_set_insn_start_param(op, 1, a1);
  }
 -#elif TARGET_INSN_START_WORDS == 3
 +#elif TARGET_INSN_START_EXTRA_WORDS == 2
  static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1,
                                        target_ulong a2)
  {
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1,
      tcg_set_insn_start_param(op, 2, a2);
  }
  #else
 -# error "Unhandled number of operands to insn_start"
 +#error Unhandled TARGET_INSN_START_EXTRA_WORDS value
  #endif
  #if TARGET_LONG_BITS == 32
 diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg-opc.h
 +++ b/include/tcg/tcg-opc.h
@@ -XXX,XX +XXX,XX @@ DEF(mulsh_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_mulsh_i64))
  #define DATA64_ARGS  (TCG_TARGET_REG_BITS == 64 ? 1 : 2)
 -/* QEMU specific */
 -DEF(insn_start, 0, 0, DATA64_ARGS * TARGET_INSN_START_WORDS,
 -    TCG_OPF_NOT_PRESENT)
 +/* There are tcg_ctx->insn_start_words here, not just one. */
 +DEF(insn_start, 0, 0, DATA64_ARGS, TCG_OPF_NOT_PRESENT)
 +
  DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
  DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
  DEF(goto_ptr, 0, 1, 0, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg.h
 +++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
  #define TCG_TARGET_HAS_v256             0
  #endif
 -#ifndef TARGET_INSN_START_EXTRA_WORDS
 -# define TARGET_INSN_START_WORDS 1
 -#else
 -# define TARGET_INSN_START_WORDS (1 + TARGET_INSN_START_EXTRA_WORDS)
 -#endif
 -
  typedef enum TCGOpcode {
  #define DEF(name, oargs, iargs, cargs, flags) INDEX_op_ ## name,
  #include "tcg/tcg-opc.h"
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
      uint8_t page_bits;
      uint8_t tlb_dyn_max_bits;
  #endif
 +    uint8_t insn_start_words;
      TCGRegSet reserved_regs;
      intptr_t current_frame_offset;
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
      TCGTemp *reg_to_temp[TCG_TARGET_NB_REGS];
      uint16_t gen_insn_end_off[TCG_MAX_INSNS];
 -    uint64_t gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
 +    uint64_t *gen_insn_data;
      /* Exit to translator on overflow. */
      sigjmp_buf jmp_trans;
 diff --git a/accel/tcg/perf.c b/accel/tcg/perf.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/perf.c
 +++ b/accel/tcg/perf.c
@@ -XXX,XX +XXX,XX @@ void perf_report_code(uint64_t guest_pc, TranslationBlock *tb,
                        const void *start)
  {
      struct debuginfo_query *q;
 -    size_t insn;
 +    size_t insn, start_words;
 +    uint64_t *gen_insn_data;
      if (!perfmap && !jitdump) {
          return;
@@ -XXX,XX +XXX,XX @@ void perf_report_code(uint64_t guest_pc, TranslationBlock *tb,
      debuginfo_lock();
      /* Query debuginfo for each guest instruction. */
 +    gen_insn_data = tcg_ctx->gen_insn_data;
 +    start_words = tcg_ctx->insn_start_words;
 +
      for (insn = 0; insn < tb->icount; insn++) {
          /* FIXME: This replicates the restore_state_to_opc() logic. */
 -        q[insn].address = tcg_ctx->gen_insn_data[insn][0];
 +        q[insn].address = gen_insn_data[insn * start_words + 0];
          if (tb_cflags(tb) & CF_PCREL) {
              q[insn].address |= (guest_pc & TARGET_PAGE_MASK);
          } else {
 diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@
  #include "tb-context.h"
  #include "internal.h"
  #include "perf.h"
 +#include "tcg/insn-start-words.h"
  TBContext tb_ctx;
@@ -XXX,XX +XXX,XX @@ static int64_t decode_sleb128(const uint8_t **pp)
  static int encode_search(TranslationBlock *tb, uint8_t *block)
  {
      uint8_t *highwater = tcg_ctx->code_gen_highwater;
 +    uint64_t *insn_data = tcg_ctx->gen_insn_data;
 +    uint16_t *insn_end_off = tcg_ctx->gen_insn_end_off;
      uint8_t *p = block;
      int i, j, n;
      for (i = 0, n = tb->icount; i < n; ++i) {
 -        uint64_t prev;
 +        uint64_t prev, curr;
          for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
              if (i == 0) {
                  prev = (!(tb_cflags(tb) & CF_PCREL) && j == 0 ? tb->pc : 0);
              } else {
 -                prev = tcg_ctx->gen_insn_data[i - 1][j];
 +                prev = insn_data[(i - 1) * TARGET_INSN_START_WORDS + j];
              }
 -            p = encode_sleb128(p, tcg_ctx->gen_insn_data[i][j] - prev);
 +            curr = insn_data[i * TARGET_INSN_START_WORDS + j];
 +            p = encode_sleb128(p, curr - prev);
          }
 -        prev = (i == 0 ? 0 : tcg_ctx->gen_insn_end_off[i - 1]);
 -        p = encode_sleb128(p, tcg_ctx->gen_insn_end_off[i] - prev);
 +        prev = (i == 0 ? 0 : insn_end_off[i - 1]);
 +        curr = insn_end_off[i];
 +        p = encode_sleb128(p, curr - prev);
          /* Test for (pending) buffer overflow.  The assumption is that any
             one row beginning below the high water mark cannot overrun
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
      tcg_ctx->tlb_fast_offset =
          (int)offsetof(ArchCPU, neg.tlb.f) - (int)offsetof(ArchCPU, env);
  #endif
 +    tcg_ctx->insn_start_words = TARGET_INSN_START_WORDS;
   tb_overflow:
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
              fprintf(logfile, "OUT: [size=%d]\n", gen_code_size);
              fprintf(logfile,
                      "  -- guest addr 0x%016" PRIx64 " + tb prologue\n",
 -                    tcg_ctx->gen_insn_data[insn][0]);
 +                    tcg_ctx->gen_insn_data[insn * TARGET_INSN_START_WORDS]);
              chunk_start = tcg_ctx->gen_insn_end_off[insn];
              disas(logfile, tb->tc.ptr, chunk_start);
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
                  size_t chunk_end = tcg_ctx->gen_insn_end_off[insn];
                  if (chunk_end > chunk_start) {
                      fprintf(logfile, "  -- guest addr 0x%016" PRIx64 "\n",
 -                            tcg_ctx->gen_insn_data[insn][0]);
 +                            tcg_ctx->gen_insn_data[insn * TARGET_INSN_START_WORDS]);
                      disas(logfile, tb->tc.ptr + chunk_start,
                            chunk_end - chunk_start);
                      chunk_start = chunk_end;
 diff --git a/target/i386/helper.c b/target/i386/helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/helper.c
 +++ b/target/i386/helper.c
@@ -XXX,XX +XXX,XX @@
  #endif
  #include "qemu/log.h"
  #ifdef CONFIG_TCG
 -#include "tcg/tcg.h"
 +#include "tcg/insn-start-words.h"
  #endif
  void cpu_sync_avx_hflag(CPUX86State *env)
 diff --git a/target/openrisc/sys_helper.c b/target/openrisc/sys_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/openrisc/sys_helper.c
 +++ b/target/openrisc/sys_helper.c
@@ -XXX,XX +XXX,XX @@
  #ifndef CONFIG_USER_ONLY
  #include "hw/boards.h"
  #endif
 -#include "tcg/tcg.h"
 +#include "tcg/insn-start-words.h"
  #define TO_SPR(group, number) (((group) << 11) + (number))
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
      tcg_debug_assert(s->tlb_fast_offset < 0);
      tcg_debug_assert(s->tlb_fast_offset >= MIN_TLB_MASK_TABLE_OFS);
  #endif
 +
 +    tcg_debug_assert(s->insn_start_words > 0);
  }
  static TCGTemp *tcg_temp_alloc(TCGContext *s)
@@ -XXX,XX +XXX,XX @@ static void tcg_dump_ops(TCGContext *s, FILE *f, bool have_prefs)
              nb_oargs = 0;
              col += ne_fprintf(f, "\n ----");
 -            for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
 +            for (i = 0, k = s->insn_start_words; i < k; ++i) {
                  col += ne_fprintf(f, " %016" PRIx64,
                                    tcg_get_insn_start_param(op, i));
              }
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
  #ifdef CONFIG_PROFILER
      TCGProfile *prof = &s->prof;
  #endif
 -    int i, num_insns;
 +    int i, start_words, num_insns;
      TCGOp *op;
  #ifdef CONFIG_PROFILER
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
      s->pool_labels = NULL;
  #endif
 +    start_words = s->insn_start_words;
 +    s->gen_insn_data =
 +        tcg_malloc(sizeof(uint64_t) * s->gen_tb->icount * start_words);
 +
      num_insns = -1;
      QTAILQ_FOREACH(op, &s->ops, link) {
          TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
                  assert(s->gen_insn_end_off[num_insns] == off);
              }
              num_insns++;
 -            for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
 -                s->gen_insn_data[num_insns][i] =
 +            for (i = 0; i < start_words; ++i) {
 +                s->gen_insn_data[num_insns * start_words + i] =
                      tcg_get_insn_start_param(op, i);
              }
              break;
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
              return -2;
          }
      }
--    tcg_debug_assert(num_insns >= 0);
+-    return false;
-+    tcg_debug_assert(num_insns + 1 == s->gen_tb->icount);
++    return finish_folding(ctx, op);
-     s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
+ }
-     /* Generate TB finalization at the end of block */
+ /* Propagate constants and copies, fold constant expressions. */
 --
-.34.1
+.43.0

-[PULL 32/52] exec-all: Widen TranslationBlock pc and cs_base to 64-bits
+[PULL 50/72] tcg/optimize: Use finish_folding as default in tcg_optimize
-This makes TranslationBlock agnostic to the address size of the guest.
+All non-default cases now finish folding within each function.
-Use vaddr for pc, since that's always a virtual address.
+Do the same with the default case and assert it is done after.
 Use uint64_t for cs_base, since usage varies between guests.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h | 4 ++--
+ tcg/optimize.c | 6 ++----
- accel/tcg/cpu-exec.c    | 2 +-
+file changed, 2 insertions(+), 4 deletions(-)
 files changed, 3 insertions(+), 3 deletions(-)
-diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-      * Unwind information is taken as offsets from the page, to be
+             done = true;
-      * deposited into the "current" PC.
+             break;
-      */
+         default:
--    target_ulong pc;
++            done = finish_folding(&ctx, op);
-+    vaddr pc;
+             break;
+         }
-     /*
+-
-      * Target-specific data associated with the TranslationBlock, e.g.:
+-        if (!done) {
-@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
+-            finish_folding(&ctx, op);
-      * s390x: instruction data for EXECUTE,
+-        }
-      * sparc: the next pc of the instruction queue (for delay slots).
++        tcg_debug_assert(done);
-      */
+     }
--    target_ulong cs_base;
+ }
 +    uint64_t cs_base;
      uint32_t flags; /* flags defining in which context the code was generated */
      uint32_t cflags;    /* compile flags */
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
  {
      if (qemu_log_in_addr_range(pc)) {
          qemu_log_mask(CPU_LOG_EXEC,
 -                      "Trace %d: %p [" TARGET_FMT_lx
 +                      "Trace %d: %p [%08" PRIx64
                        "/" TARGET_FMT_lx "/%08x/%08x] %s\n",
                        cpu->cpu_index, tb->tc.ptr, tb->cs_base, pc,
                        tb->flags, tb->cflags, lookup_symbol(pc));
 --
-.34.1
+.43.0

-[PULL 27/52] tcg: Add guest_mo to TCGContext
+[PULL 51/72] tcg/optimize: Remove z_mask, s_mask from OptContext
-This replaces of TCG_GUEST_DEFAULT_MO in tcg-op-ldst.c.
+All mask setting is now done with parameters via fold_masks_*.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h         | 1 +
+ tcg/optimize.c | 13 -------------
- accel/tcg/translate-all.c | 5 +++++
+file changed, 13 deletions(-)
  tcg/tcg-op-ldst.c         | 4 +---
 files changed, 7 insertions(+), 3 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct TCGContext {
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-     uint8_t tlb_dyn_max_bits;
+     QSIMPLEQ_HEAD(, MemCopyInfo) mem_free;
- #endif
-     uint8_t insn_start_words;
+     /* In flight values from optimization. */
-+    TCGBar guest_mo;
+-    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
+-    uint64_t s_mask;  /* mask bit is 1 if value bit matches msb */
-     TCGRegSet reserved_regs;
+     TCGType type;
-     intptr_t current_frame_offset;
+ } OptContext;
-diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ static bool finish_folding(OptContext *ctx, TCGOp *op)
---- a/accel/tcg/translate-all.c
+     for (i = 0; i < nb_oargs; i++) {
-+++ b/accel/tcg/translate-all.c
+         TCGTemp *ts = arg_temp(op->args[i]);
-@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
+         reset_ts(ctx, ts);
-         (int)offsetof(ArchCPU, neg.tlb.f) - (int)offsetof(ArchCPU, env);
+-        /*
- #endif
+-         * Save the corresponding known-zero/sign bits mask for the
-     tcg_ctx->insn_start_words = TARGET_INSN_START_WORDS;
+-         * first output argument (only one supported so far).
-+#ifdef TCG_GUEST_DEFAULT_MO
+-         */
-+    tcg_ctx->guest_mo = TCG_GUEST_DEFAULT_MO;
+-        if (i == 0) {
-+#else
+-            ts_info(ts)->z_mask = ctx->z_mask;
-+    tcg_ctx->guest_mo = TCG_MO_ALL;
+-        }
-+#endif
+     }
+     return true;
-  tb_overflow:
+ }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
+             ctx.type = TCG_TYPE_I32;
-index XXXXXXX..XXXXXXX 100644
+         }
---- a/tcg/tcg-op-ldst.c
-+++ b/tcg/tcg-op-ldst.c
+-        /* Assume all bits affected, no bits known zero, no sign reps. */
-@@ -XXX,XX +XXX,XX @@ static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 v, TCGTemp *addr, MemOpIdx oi)
+-        ctx.z_mask = -1;
+-        ctx.s_mask = 0;
- static void tcg_gen_req_mo(TCGBar type)
+-
- {
+         /*
--#ifdef TCG_GUEST_DEFAULT_MO
+          * Process each opcode.
--    type &= TCG_GUEST_DEFAULT_MO;
+          * Sorted alphabetically by opcode as much as possible.
 -#endif
 +    type &= tcg_ctx->guest_mo;
      type &= ~TCG_TARGET_DEFAULT_MO;
      if (type) {
          tcg_gen_mb(type | TCG_BAR_SC);
 --
-.34.1
+.43.0

-[PULL 15/52] tcg: Split tcg/tcg-op-common.h from tcg/tcg-op.h
+[PULL 52/72] tcg/optimize: Re-enable sign-mask optimizations
-Create tcg/tcg-op-common.h, moving everything that does not concern
+All instances of s_mask have been converted to the new
-TARGET_LONG_BITS or TCGv.  Adjust tcg/*.c to use the new header
+representation.  We can now re-enable usage.
 instead of tcg-op.h, in preparation for compiling tcg/ only once.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-op-common.h |  996 ++++++++++++++++++++++++++++++++++
+ tcg/optimize.c | 4 ++--
- include/tcg/tcg-op.h        | 1004 +----------------------------------
+file changed, 2 insertions(+), 2 deletions(-)
  tcg/optimize.c              |    2 +-
  tcg/tcg-op-gvec.c           |    2 +-
  tcg/tcg-op-ldst.c           |    2 +-
  tcg/tcg-op-vec.c            |    2 +-
  tcg/tcg-op.c                |    2 +-
  tcg/tcg.c                   |    2 +-
  tcg/tci.c                   |    3 +-
 files changed, 1007 insertions(+), 1008 deletions(-)
  create mode 100644 include/tcg/tcg-op-common.h
-diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
-new file mode 100644
-index XXXXXXX..XXXXXXX
---- /dev/null
-+++ b/include/tcg/tcg-op-common.h
-@@ -XXX,XX +XXX,XX @@
-+/* SPDX-License-Identifier: MIT */
-+/*
-+ * Target independent opcode generation functions.
-+ *
-+ * Copyright (c) 2008 Fabrice Bellard
-+ */
-+
-+#ifndef TCG_TCG_OP_COMMON_H
-+#define TCG_TCG_OP_COMMON_H
-+
-+#include "tcg/tcg.h"
-+#include "exec/helper-proto.h"
-+#include "exec/helper-gen.h"
-+
-+/* Basic output routines.  Not for general consumption.  */
-+
-+void tcg_gen_op1(TCGOpcode, TCGArg);
-+void tcg_gen_op2(TCGOpcode, TCGArg, TCGArg);
-+void tcg_gen_op3(TCGOpcode, TCGArg, TCGArg, TCGArg);
-+void tcg_gen_op4(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg);
-+void tcg_gen_op5(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
-+void tcg_gen_op6(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
-+
-+void vec_gen_2(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg);
-+void vec_gen_3(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg);
-+void vec_gen_4(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg, TCGArg);
-+
-+static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1)
-+{
-+    tcg_gen_op1(opc, tcgv_i32_arg(a1));
-+}
-+
-+static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 a1)
-+{
-+    tcg_gen_op1(opc, tcgv_i64_arg(a1));
-+}
-+
-+static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg a1)
-+{
-+    tcg_gen_op1(opc, a1);
-+}
-+
-+static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2)
-+{
-+    tcg_gen_op2(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2));
-+}
-+
-+static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2)
-+{
-+    tcg_gen_op2(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2));
-+}
-+
-+static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 a1, TCGArg a2)
-+{
-+    tcg_gen_op2(opc, tcgv_i32_arg(a1), a2);
-+}
-+
-+static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 a1, TCGArg a2)
-+{
-+    tcg_gen_op2(opc, tcgv_i64_arg(a1), a2);
-+}
-+
-+static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg a1, TCGArg a2)
-+{
-+    tcg_gen_op2(opc, a1, a2);
-+}
-+
-+static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 a1,
-+                                   TCGv_i32 a2, TCGv_i32 a3)
-+{
-+    tcg_gen_op3(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), tcgv_i32_arg(a3));
-+}
-+
-+static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 a1,
-+                                   TCGv_i64 a2, TCGv_i64 a3)
-+{
-+    tcg_gen_op3(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), tcgv_i64_arg(a3));
-+}
-+
-+static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 a1,
-+                                    TCGv_i32 a2, TCGArg a3)
-+{
-+    tcg_gen_op3(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), a3);
-+}
-+
-+static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 a1,
-+                                    TCGv_i64 a2, TCGArg a3)
-+{
-+    tcg_gen_op3(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), a3);
-+}
-+
-+static inline void tcg_gen_ldst_op_i32(TCGOpcode opc, TCGv_i32 val,
-+                                       TCGv_ptr base, TCGArg offset)
-+{
-+    tcg_gen_op3(opc, tcgv_i32_arg(val), tcgv_ptr_arg(base), offset);
-+}
-+
-+static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, TCGv_i64 val,
-+                                       TCGv_ptr base, TCGArg offset)
-+{
-+    tcg_gen_op3(opc, tcgv_i64_arg(val), tcgv_ptr_arg(base), offset);
-+}
-+
-+static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                   TCGv_i32 a3, TCGv_i32 a4)
-+{
-+    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-+                tcgv_i32_arg(a3), tcgv_i32_arg(a4));
-+}
-+
-+static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                   TCGv_i64 a3, TCGv_i64 a4)
-+{
-+    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-+                tcgv_i64_arg(a3), tcgv_i64_arg(a4));
-+}
-+
-+static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                    TCGv_i32 a3, TCGArg a4)
-+{
-+    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-+                tcgv_i32_arg(a3), a4);
-+}
-+
-+static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                    TCGv_i64 a3, TCGArg a4)
-+{
-+    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-+                tcgv_i64_arg(a3), a4);
-+}
-+
-+static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                     TCGArg a3, TCGArg a4)
-+{
-+    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), a3, a4);
-+}
-+
-+static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                     TCGArg a3, TCGArg a4)
-+{
-+    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), a3, a4);
-+}
-+
-+static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                   TCGv_i32 a3, TCGv_i32 a4, TCGv_i32 a5)
-+{
-+    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5));
-+}
-+
-+static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                   TCGv_i64 a3, TCGv_i64 a4, TCGv_i64 a5)
-+{
-+    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5));
-+}
-+
-+static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                    TCGv_i32 a3, TCGv_i32 a4, TCGArg a5)
-+{
-+    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), a5);
-+}
-+
-+static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                    TCGv_i64 a3, TCGv_i64 a4, TCGArg a5)
-+{
-+    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), a5);
-+}
-+
-+static inline void tcg_gen_op5ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                     TCGv_i32 a3, TCGArg a4, TCGArg a5)
-+{
-+    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-+                tcgv_i32_arg(a3), a4, a5);
-+}
-+
-+static inline void tcg_gen_op5ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                     TCGv_i64 a3, TCGArg a4, TCGArg a5)
-+{
-+    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-+                tcgv_i64_arg(a3), a4, a5);
-+}
-+
-+static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                   TCGv_i32 a3, TCGv_i32 a4,
-+                                   TCGv_i32 a5, TCGv_i32 a6)
-+{
-+    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5),
-+                tcgv_i32_arg(a6));
-+}
-+
-+static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                   TCGv_i64 a3, TCGv_i64 a4,
-+                                   TCGv_i64 a5, TCGv_i64 a6)
-+{
-+    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5),
-+                tcgv_i64_arg(a6));
-+}
-+
-+static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                    TCGv_i32 a3, TCGv_i32 a4,
-+                                    TCGv_i32 a5, TCGArg a6)
-+{
-+    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5), a6);
-+}
-+
-+static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                    TCGv_i64 a3, TCGv_i64 a4,
-+                                    TCGv_i64 a5, TCGArg a6)
-+{
-+    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5), a6);
-+}
-+
-+static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-+                                     TCGv_i32 a3, TCGv_i32 a4,
-+                                     TCGArg a5, TCGArg a6)
-+{
-+    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), a5, a6);
-+}
-+
-+static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-+                                     TCGv_i64 a3, TCGv_i64 a4,
-+                                     TCGArg a5, TCGArg a6)
-+{
-+    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), a5, a6);
-+}
-+
-+
-+/* Generic ops.  */
-+
-+static inline void gen_set_label(TCGLabel *l)
-+{
-+    l->present = 1;
-+    tcg_gen_op1(INDEX_op_set_label, label_arg(l));
-+}
-+
-+void tcg_gen_br(TCGLabel *l);
-+void tcg_gen_mb(TCGBar);
-+
-+/**
-+ * tcg_gen_exit_tb() - output exit_tb TCG operation
-+ * @tb: The TranslationBlock from which we are exiting
-+ * @idx: Direct jump slot index, or exit request
-+ *
-+ * See tcg/README for more info about this TCG operation.
-+ * See also tcg.h and the block comment above TB_EXIT_MASK.
-+ *
-+ * For a normal exit from the TB, back to the main loop, @tb should
-+ * be NULL and @idx should be 0.  Otherwise, @tb should be valid and
-+ * @idx should be one of the TB_EXIT_ values.
-+ */
-+void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx);
-+
-+/**
-+ * tcg_gen_goto_tb() - output goto_tb TCG operation
-+ * @idx: Direct jump slot index (0 or 1)
-+ *
-+ * See tcg/README for more info about this TCG operation.
-+ *
-+ * NOTE: In softmmu emulation, direct jumps with goto_tb are only safe within
-+ * the pages this TB resides in because we don't take care of direct jumps when
-+ * address mapping changes, e.g. in tlb_flush(). In user mode, there's only a
-+ * static address translation, so the destination address is always valid, TBs
-+ * are always invalidated properly, and direct jumps are reset when mapping
-+ * changes.
-+ */
-+void tcg_gen_goto_tb(unsigned idx);
-+
-+/**
-+ * tcg_gen_lookup_and_goto_ptr() - look up the current TB, jump to it if valid
-+ * @addr: Guest address of the target TB
-+ *
-+ * If the TB is not valid, jump to the epilogue.
-+ *
-+ * This operation is optional. If the TCG backend does not implement goto_ptr,
-+ * this op is equivalent to calling tcg_gen_exit_tb() with 0 as the argument.
-+ */
-+void tcg_gen_lookup_and_goto_ptr(void);
-+
-+static inline void tcg_gen_plugin_cb_start(unsigned from, unsigned type,
-+                                           unsigned wr)
-+{
-+    tcg_gen_op3(INDEX_op_plugin_cb_start, from, type, wr);
-+}
-+
-+static inline void tcg_gen_plugin_cb_end(void)
-+{
-+    tcg_emit_op(INDEX_op_plugin_cb_end, 0);
-+}
-+
-+/* 32 bit ops */
-+
-+void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg);
-+void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
-+void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_clz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_ctz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_clzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
-+void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
-+void tcg_gen_clrsb_i32(TCGv_i32 ret, TCGv_i32 arg);
-+void tcg_gen_ctpop_i32(TCGv_i32 a1, TCGv_i32 a2);
-+void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
-+                         unsigned int ofs, unsigned int len);
-+void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
-+                           unsigned int ofs, unsigned int len);
-+void tcg_gen_extract_i32(TCGv_i32 ret, TCGv_i32 arg,
-+                         unsigned int ofs, unsigned int len);
-+void tcg_gen_sextract_i32(TCGv_i32 ret, TCGv_i32 arg,
-+                          unsigned int ofs, unsigned int len);
-+void tcg_gen_extract2_i32(TCGv_i32 ret, TCGv_i32 al, TCGv_i32 ah,
-+                          unsigned int ofs);
-+void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, TCGv_i32 arg2, TCGLabel *);
-+void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, TCGLabel *);
-+void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
-+                         TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
-+                          TCGv_i32 arg1, int32_t arg2);
-+void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1,
-+                         TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2);
-+void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
-+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
-+void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
-+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
-+void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_mulsu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg);
-+void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg);
-+void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
-+void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
-+void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg, int flags);
-+void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
-+void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg);
-+void tcg_gen_smin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_smax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_umax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
-+void tcg_gen_abs_i32(TCGv_i32, TCGv_i32);
-+
-+/* Replicate a value of size @vece from @in to all the lanes in @out */
-+void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in);
-+
-+static inline void tcg_gen_discard_i32(TCGv_i32 arg)
-+{
-+    tcg_gen_op1_i32(INDEX_op_discard, arg);
-+}
-+
-+static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
-+{
-+    if (ret != arg) {
-+        tcg_gen_op2_i32(INDEX_op_mov_i32, ret, arg);
-+    }
-+}
-+
-+static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2,
-+                                    tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i32(INDEX_op_ld8u_i32, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2,
-+                                    tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i32(INDEX_op_ld8s_i32, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2,
-+                                     tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i32(INDEX_op_ld16u_i32, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2,
-+                                     tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i32(INDEX_op_ld16s_i32, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2,
-+                                  tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i32(INDEX_op_ld_i32, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2,
-+                                   tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i32(INDEX_op_st8_i32, arg1, arg2, offset);
-+}
-+
-+static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2,
-+                                    tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i32(INDEX_op_st16_i32, arg1, arg2, offset);
-+}
-+
-+static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2,
-+                                  tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i32(INDEX_op_st_i32, arg1, arg2, offset);
-+}
-+
-+static inline void tcg_gen_add_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_add_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_sub_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_sub_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_and_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_or_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_xor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_shl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_shl_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_shr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_shr_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_sar_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_sar_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_mul_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-+{
-+    tcg_gen_op3_i32(INDEX_op_mul_i32, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg)
-+{
-+    if (TCG_TARGET_HAS_neg_i32) {
-+        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg);
-+    } else {
-+        tcg_gen_subfi_i32(ret, 0, arg);
-+    }
-+}
-+
-+static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
-+{
-+    if (TCG_TARGET_HAS_not_i32) {
-+        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg);
-+    } else {
-+        tcg_gen_xori_i32(ret, arg, -1);
-+    }
-+}
-+
-+/* 64 bit ops */
-+
-+void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
-+void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
-+void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_clz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_ctz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_clzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
-+void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
-+void tcg_gen_clrsb_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_ctpop_i64(TCGv_i64 a1, TCGv_i64 a2);
-+void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
-+                         unsigned int ofs, unsigned int len);
-+void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
-+                           unsigned int ofs, unsigned int len);
-+void tcg_gen_extract_i64(TCGv_i64 ret, TCGv_i64 arg,
-+                         unsigned int ofs, unsigned int len);
-+void tcg_gen_sextract_i64(TCGv_i64 ret, TCGv_i64 arg,
-+                          unsigned int ofs, unsigned int len);
-+void tcg_gen_extract2_i64(TCGv_i64 ret, TCGv_i64 al, TCGv_i64 ah,
-+                          unsigned int ofs);
-+void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, TCGLabel *);
-+void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, TCGLabel *);
-+void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
-+                         TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
-+                          TCGv_i64 arg1, int64_t arg2);
-+void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1,
-+                         TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2);
-+void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
-+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
-+void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
-+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
-+void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_mulsu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
-+void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
-+void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_hswap_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_wswap_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_smin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_smax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_umax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_abs_i64(TCGv_i64, TCGv_i64);
-+
-+/* Replicate a value of size @vece from @in to all the lanes in @out */
-+void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in);
-+
-+#if TCG_TARGET_REG_BITS == 64
-+static inline void tcg_gen_discard_i64(TCGv_i64 arg)
-+{
-+    tcg_gen_op1_i64(INDEX_op_discard, arg);
-+}
-+
-+static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
-+{
-+    if (ret != arg) {
-+        tcg_gen_op2_i64(INDEX_op_mov_i64, ret, arg);
-+    }
-+}
-+
-+static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-+                                    tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_ld8u_i64, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-+                                    tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_ld8s_i64, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-+                                     tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_ld16u_i64, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-+                                     tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_ld16s_i64, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-+                                     tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_ld32u_i64, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-+                                     tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_ld32s_i64, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2,
-+                                  tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_ld_i64, ret, arg2, offset);
-+}
-+
-+static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-+                                   tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_st8_i64, arg1, arg2, offset);
-+}
-+
-+static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-+                                    tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_st16_i64, arg1, arg2, offset);
-+}
-+
-+static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-+                                    tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_st32_i64, arg1, arg2, offset);
-+}
-+
-+static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-+                                  tcg_target_long offset)
-+{
-+    tcg_gen_ldst_op_i64(INDEX_op_st_i64, arg1, arg2, offset);
-+}
-+
-+static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_add_i64, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_sub_i64, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_shl_i64, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_shr_i64, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_sar_i64, ret, arg1, arg2);
-+}
-+
-+static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-+{
-+    tcg_gen_op3_i64(INDEX_op_mul_i64, ret, arg1, arg2);
-+}
-+#else /* TCG_TARGET_REG_BITS == 32 */
-+void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
-+
-+void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+
-+void tcg_gen_discard_i64(TCGv_i64 arg);
-+void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
-+void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
-+void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-+#endif /* TCG_TARGET_REG_BITS */
-+
-+static inline void tcg_gen_neg_i64(TCGv_i64 ret, TCGv_i64 arg)
-+{
-+    if (TCG_TARGET_HAS_neg_i64) {
-+        tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg);
-+    } else {
-+        tcg_gen_subfi_i64(ret, 0, arg);
-+    }
-+}
-+
-+/* Size changing operations.  */
-+
-+void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
-+void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
-+void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high);
-+void tcg_gen_extrl_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
-+void tcg_gen_extrh_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
-+void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg);
-+void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg);
-+
-+void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src);
-+void tcg_gen_extr_i128_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i128 arg);
-+void tcg_gen_concat_i64_i128(TCGv_i128 ret, TCGv_i64 lo, TCGv_i64 hi);
-+
-+static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
-+{
-+    tcg_gen_deposit_i64(ret, lo, hi, 32, 32);
-+}
-+
-+/* Local load/store bit ops */
-+
-+void tcg_gen_qemu_ld_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
-+void tcg_gen_qemu_st_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
-+void tcg_gen_qemu_ld_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType);
-+void tcg_gen_qemu_st_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType);
-+void tcg_gen_qemu_ld_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType);
-+void tcg_gen_qemu_st_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType);
-+
-+/* Atomic ops */
-+
-+void tcg_gen_atomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32,
-+                                    TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64,
-+                                    TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128,
-+                                     TCGv_i128, TCGArg, MemOp, TCGType);
-+
-+void tcg_gen_nonatomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_nonatomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_nonatomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128,
-+                                        TCGv_i128, TCGArg, MemOp, TCGType);
-+
-+void tcg_gen_atomic_xchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                 TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_xchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                 TCGArg, MemOp, TCGType);
-+
-+void tcg_gen_atomic_fetch_add_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_add_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_and_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_and_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_or_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                     TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_or_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                     TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_xor_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_xor_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_smin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_smin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_umin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_umin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_smax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_smax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_umax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_fetch_umax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+
-+void tcg_gen_atomic_add_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_add_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_and_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_and_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_or_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                     TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_or_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                     TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_xor_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_xor_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                      TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_smin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_smin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_umin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_umin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_smax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_smax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_umax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-+                                       TCGArg, MemOp, TCGType);
-+void tcg_gen_atomic_umax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-+                                       TCGArg, MemOp, TCGType);
-+
-+/* Vector ops */
-+
-+void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
-+void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
-+void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
-+void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
-+void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
-+void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
-+void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
-+void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
-+void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_smin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_umin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_smax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-+
-+void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-+void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-+void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-+void tcg_gen_rotli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-+void tcg_gen_rotri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-+
-+void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
-+void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
-+void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
-+void tcg_gen_rotls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
-+
-+void tcg_gen_shlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-+void tcg_gen_shrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-+void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-+void tcg_gen_rotlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-+void tcg_gen_rotrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-+
-+void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r,
-+                     TCGv_vec a, TCGv_vec b);
-+
-+void tcg_gen_bitsel_vec(unsigned vece, TCGv_vec r, TCGv_vec a,
-+                        TCGv_vec b, TCGv_vec c);
-+void tcg_gen_cmpsel_vec(TCGCond cond, unsigned vece, TCGv_vec r,
-+                        TCGv_vec a, TCGv_vec b, TCGv_vec c, TCGv_vec d);
-+
-+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
-+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
-+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
-+
-+/* Host pointer ops */
-+
-+#if UINTPTR_MAX == UINT32_MAX
-+# define PTR  i32
-+# define NAT  TCGv_i32
-+#else
-+# define PTR  i64
-+# define NAT  TCGv_i64
-+#endif
-+
-+static inline void tcg_gen_ld_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t o)
-+{
-+    glue(tcg_gen_ld_,PTR)((NAT)r, a, o);
-+}
-+
-+static inline void tcg_gen_st_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t o)
-+{
-+    glue(tcg_gen_st_, PTR)((NAT)r, a, o);
-+}
-+
-+static inline void tcg_gen_discard_ptr(TCGv_ptr a)
-+{
-+    glue(tcg_gen_discard_,PTR)((NAT)a);
-+}
-+
-+static inline void tcg_gen_add_ptr(TCGv_ptr r, TCGv_ptr a, TCGv_ptr b)
-+{
-+    glue(tcg_gen_add_,PTR)((NAT)r, (NAT)a, (NAT)b);
-+}
-+
-+static inline void tcg_gen_addi_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t b)
-+{
-+    glue(tcg_gen_addi_,PTR)((NAT)r, (NAT)a, b);
-+}
-+
-+static inline void tcg_gen_mov_ptr(TCGv_ptr d, TCGv_ptr s)
-+{
-+    glue(tcg_gen_mov_,PTR)((NAT)d, (NAT)s);
-+}
-+
-+static inline void tcg_gen_movi_ptr(TCGv_ptr d, intptr_t s)
-+{
-+    glue(tcg_gen_movi_,PTR)((NAT)d, s);
-+}
-+
-+static inline void tcg_gen_brcondi_ptr(TCGCond cond, TCGv_ptr a,
-+                                       intptr_t b, TCGLabel *label)
-+{
-+    glue(tcg_gen_brcondi_,PTR)(cond, (NAT)a, b, label);
-+}
-+
-+static inline void tcg_gen_ext_i32_ptr(TCGv_ptr r, TCGv_i32 a)
-+{
-+#if UINTPTR_MAX == UINT32_MAX
-+    tcg_gen_mov_i32((NAT)r, a);
-+#else
-+    tcg_gen_ext_i32_i64((NAT)r, a);
-+#endif
-+}
-+
-+static inline void tcg_gen_trunc_i64_ptr(TCGv_ptr r, TCGv_i64 a)
-+{
-+#if UINTPTR_MAX == UINT32_MAX
-+    tcg_gen_extrl_i64_i32((NAT)r, a);
-+#else
-+    tcg_gen_mov_i64((NAT)r, a);
-+#endif
-+}
-+
-+static inline void tcg_gen_extu_ptr_i64(TCGv_i64 r, TCGv_ptr a)
-+{
-+#if UINTPTR_MAX == UINT32_MAX
-+    tcg_gen_extu_i32_i64(r, (NAT)a);
-+#else
-+    tcg_gen_mov_i64(r, (NAT)a);
-+#endif
-+}
-+
-+static inline void tcg_gen_trunc_ptr_i32(TCGv_i32 r, TCGv_ptr a)
-+{
-+#if UINTPTR_MAX == UINT32_MAX
-+    tcg_gen_mov_i32(r, (NAT)a);
-+#else
-+    tcg_gen_extrl_i64_i32(r, (NAT)a);
-+#endif
-+}
-+
-+#undef PTR
-+#undef NAT
-+
-+#endif /* TCG_TCG_OP_COMMON_H */
-diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op.h
-+++ b/include/tcg/tcg-op.h
-@@ -XXX,XX +XXX,XX @@
-+/* SPDX-License-Identifier: MIT */
- /*
-- * Tiny Code Generator for QEMU
-+ * Target dependent opcode generation functions.
-  *
-  * Copyright (c) 2008 Fabrice Bellard
-- *
-- * Permission is hereby granted, free of charge, to any person obtaining a copy
-- * of this software and associated documentation files (the "Software"), to deal
-- * in the Software without restriction, including without limitation the rights
-- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-- * copies of the Software, and to permit persons to whom the Software is
-- * furnished to do so, subject to the following conditions:
-- *
-- * The above copyright notice and this permission notice shall be included in
-- * all copies or substantial portions of the Software.
-- *
-- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-- * THE SOFTWARE.
-  */
- #ifndef TCG_TCG_OP_H
- #define TCG_TCG_OP_H
--#include "tcg/tcg.h"
--#include "exec/helper-proto.h"
--#include "exec/helper-gen.h"
--
--/* Basic output routines.  Not for general consumption.  */
--
--void tcg_gen_op1(TCGOpcode, TCGArg);
--void tcg_gen_op2(TCGOpcode, TCGArg, TCGArg);
--void tcg_gen_op3(TCGOpcode, TCGArg, TCGArg, TCGArg);
--void tcg_gen_op4(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg);
--void tcg_gen_op5(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
--void tcg_gen_op6(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
--
--void vec_gen_2(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg);
--void vec_gen_3(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg);
--void vec_gen_4(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg, TCGArg);
--
--static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1)
--{
--    tcg_gen_op1(opc, tcgv_i32_arg(a1));
--}
--
--static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 a1)
--{
--    tcg_gen_op1(opc, tcgv_i64_arg(a1));
--}
--
--static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg a1)
--{
--    tcg_gen_op1(opc, a1);
--}
--
--static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2)
--{
--    tcg_gen_op2(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2));
--}
--
--static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2)
--{
--    tcg_gen_op2(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2));
--}
--
--static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 a1, TCGArg a2)
--{
--    tcg_gen_op2(opc, tcgv_i32_arg(a1), a2);
--}
--
--static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 a1, TCGArg a2)
--{
--    tcg_gen_op2(opc, tcgv_i64_arg(a1), a2);
--}
--
--static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg a1, TCGArg a2)
--{
--    tcg_gen_op2(opc, a1, a2);
--}
--
--static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 a1,
--                                   TCGv_i32 a2, TCGv_i32 a3)
--{
--    tcg_gen_op3(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), tcgv_i32_arg(a3));
--}
--
--static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 a1,
--                                   TCGv_i64 a2, TCGv_i64 a3)
--{
--    tcg_gen_op3(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), tcgv_i64_arg(a3));
--}
--
--static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 a1,
--                                    TCGv_i32 a2, TCGArg a3)
--{
--    tcg_gen_op3(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), a3);
--}
--
--static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 a1,
--                                    TCGv_i64 a2, TCGArg a3)
--{
--    tcg_gen_op3(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), a3);
--}
--
--static inline void tcg_gen_ldst_op_i32(TCGOpcode opc, TCGv_i32 val,
--                                       TCGv_ptr base, TCGArg offset)
--{
--    tcg_gen_op3(opc, tcgv_i32_arg(val), tcgv_ptr_arg(base), offset);
--}
--
--static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, TCGv_i64 val,
--                                       TCGv_ptr base, TCGArg offset)
--{
--    tcg_gen_op3(opc, tcgv_i64_arg(val), tcgv_ptr_arg(base), offset);
--}
--
--static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                   TCGv_i32 a3, TCGv_i32 a4)
--{
--    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
--                tcgv_i32_arg(a3), tcgv_i32_arg(a4));
--}
--
--static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                   TCGv_i64 a3, TCGv_i64 a4)
--{
--    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
--                tcgv_i64_arg(a3), tcgv_i64_arg(a4));
--}
--
--static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                    TCGv_i32 a3, TCGArg a4)
--{
--    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
--                tcgv_i32_arg(a3), a4);
--}
--
--static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                    TCGv_i64 a3, TCGArg a4)
--{
--    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
--                tcgv_i64_arg(a3), a4);
--}
--
--static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                     TCGArg a3, TCGArg a4)
--{
--    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), a3, a4);
--}
--
--static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                     TCGArg a3, TCGArg a4)
--{
--    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), a3, a4);
--}
--
--static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                   TCGv_i32 a3, TCGv_i32 a4, TCGv_i32 a5)
--{
--    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
--                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5));
--}
--
--static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                   TCGv_i64 a3, TCGv_i64 a4, TCGv_i64 a5)
--{
--    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
--                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5));
--}
--
--static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                    TCGv_i32 a3, TCGv_i32 a4, TCGArg a5)
--{
--    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
--                tcgv_i32_arg(a3), tcgv_i32_arg(a4), a5);
--}
--
--static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                    TCGv_i64 a3, TCGv_i64 a4, TCGArg a5)
--{
--    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
--                tcgv_i64_arg(a3), tcgv_i64_arg(a4), a5);
--}
--
--static inline void tcg_gen_op5ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                     TCGv_i32 a3, TCGArg a4, TCGArg a5)
--{
--    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
--                tcgv_i32_arg(a3), a4, a5);
--}
--
--static inline void tcg_gen_op5ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                     TCGv_i64 a3, TCGArg a4, TCGArg a5)
--{
--    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
--                tcgv_i64_arg(a3), a4, a5);
--}
--
--static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                   TCGv_i32 a3, TCGv_i32 a4,
--                                   TCGv_i32 a5, TCGv_i32 a6)
--{
--    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
--                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5),
--                tcgv_i32_arg(a6));
--}
--
--static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                   TCGv_i64 a3, TCGv_i64 a4,
--                                   TCGv_i64 a5, TCGv_i64 a6)
--{
--    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
--                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5),
--                tcgv_i64_arg(a6));
--}
--
--static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                    TCGv_i32 a3, TCGv_i32 a4,
--                                    TCGv_i32 a5, TCGArg a6)
--{
--    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
--                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5), a6);
--}
--
--static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                    TCGv_i64 a3, TCGv_i64 a4,
--                                    TCGv_i64 a5, TCGArg a6)
--{
--    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
--                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5), a6);
--}
--
--static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
--                                     TCGv_i32 a3, TCGv_i32 a4,
--                                     TCGArg a5, TCGArg a6)
--{
--    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
--                tcgv_i32_arg(a3), tcgv_i32_arg(a4), a5, a6);
--}
--
--static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
--                                     TCGv_i64 a3, TCGv_i64 a4,
--                                     TCGArg a5, TCGArg a6)
--{
--    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
--                tcgv_i64_arg(a3), tcgv_i64_arg(a4), a5, a6);
--}
--
--
--/* Generic ops.  */
--
--static inline void gen_set_label(TCGLabel *l)
--{
--    l->present = 1;
--    tcg_gen_op1(INDEX_op_set_label, label_arg(l));
--}
--
--void tcg_gen_br(TCGLabel *l);
--void tcg_gen_mb(TCGBar);
--
--/* Helper calls. */
--
--/* 32 bit ops */
--
--void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg);
--void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
--void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_clz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_ctz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_clzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
--void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
--void tcg_gen_clrsb_i32(TCGv_i32 ret, TCGv_i32 arg);
--void tcg_gen_ctpop_i32(TCGv_i32 a1, TCGv_i32 a2);
--void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
--                         unsigned int ofs, unsigned int len);
--void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
--                           unsigned int ofs, unsigned int len);
--void tcg_gen_extract_i32(TCGv_i32 ret, TCGv_i32 arg,
--                         unsigned int ofs, unsigned int len);
--void tcg_gen_sextract_i32(TCGv_i32 ret, TCGv_i32 arg,
--                          unsigned int ofs, unsigned int len);
--void tcg_gen_extract2_i32(TCGv_i32 ret, TCGv_i32 al, TCGv_i32 ah,
--                          unsigned int ofs);
--void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, TCGv_i32 arg2, TCGLabel *);
--void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, TCGLabel *);
--void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
--                         TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
--                          TCGv_i32 arg1, int32_t arg2);
--void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1,
--                         TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2);
--void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
--                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
--void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
--                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
--void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_mulsu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg);
--void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg);
--void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
--void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
--void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg, int flags);
--void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
--void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg);
--void tcg_gen_smin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_smax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_umax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
--void tcg_gen_abs_i32(TCGv_i32, TCGv_i32);
--
--/* Replicate a value of size @vece from @in to all the lanes in @out */
--void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in);
--
--static inline void tcg_gen_discard_i32(TCGv_i32 arg)
--{
--    tcg_gen_op1_i32(INDEX_op_discard, arg);
--}
--
--static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
--{
--    if (ret != arg) {
--        tcg_gen_op2_i32(INDEX_op_mov_i32, ret, arg);
--    }
--}
--
--static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2,
--                                    tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i32(INDEX_op_ld8u_i32, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2,
--                                    tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i32(INDEX_op_ld8s_i32, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2,
--                                     tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i32(INDEX_op_ld16u_i32, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2,
--                                     tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i32(INDEX_op_ld16s_i32, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2,
--                                  tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i32(INDEX_op_ld_i32, ret, arg2, offset);
--}
--
--static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2,
--                                   tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i32(INDEX_op_st8_i32, arg1, arg2, offset);
--}
--
--static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2,
--                                    tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i32(INDEX_op_st16_i32, arg1, arg2, offset);
--}
--
--static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2,
--                                  tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i32(INDEX_op_st_i32, arg1, arg2, offset);
--}
--
--static inline void tcg_gen_add_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_add_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_sub_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_sub_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_and_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_or_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_xor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_shl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_shl_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_shr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_shr_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_sar_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_sar_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_mul_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
--{
--    tcg_gen_op3_i32(INDEX_op_mul_i32, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg)
--{
--    if (TCG_TARGET_HAS_neg_i32) {
--        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg);
--    } else {
--        tcg_gen_subfi_i32(ret, 0, arg);
--    }
--}
--
--static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
--{
--    if (TCG_TARGET_HAS_not_i32) {
--        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg);
--    } else {
--        tcg_gen_xori_i32(ret, arg, -1);
--    }
--}
--
--/* 64 bit ops */
--
--void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
--void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
--void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_clz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_ctz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_clzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
--void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
--void tcg_gen_clrsb_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_ctpop_i64(TCGv_i64 a1, TCGv_i64 a2);
--void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
--                         unsigned int ofs, unsigned int len);
--void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
--                           unsigned int ofs, unsigned int len);
--void tcg_gen_extract_i64(TCGv_i64 ret, TCGv_i64 arg,
--                         unsigned int ofs, unsigned int len);
--void tcg_gen_sextract_i64(TCGv_i64 ret, TCGv_i64 arg,
--                          unsigned int ofs, unsigned int len);
--void tcg_gen_extract2_i64(TCGv_i64 ret, TCGv_i64 al, TCGv_i64 ah,
--                          unsigned int ofs);
--void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, TCGLabel *);
--void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, TCGLabel *);
--void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
--                         TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
--                          TCGv_i64 arg1, int64_t arg2);
--void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1,
--                         TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2);
--void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
--                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
--void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
--                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
--void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_mulsu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
--void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
--void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_hswap_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_wswap_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_smin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_smax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_umax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_abs_i64(TCGv_i64, TCGv_i64);
--
--/* Replicate a value of size @vece from @in to all the lanes in @out */
--void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in);
--
--#if TCG_TARGET_REG_BITS == 64
--static inline void tcg_gen_discard_i64(TCGv_i64 arg)
--{
--    tcg_gen_op1_i64(INDEX_op_discard, arg);
--}
--
--static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
--{
--    if (ret != arg) {
--        tcg_gen_op2_i64(INDEX_op_mov_i64, ret, arg);
--    }
--}
--
--static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2,
--                                    tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_ld8u_i64, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2,
--                                    tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_ld8s_i64, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2,
--                                     tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_ld16u_i64, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2,
--                                     tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_ld16s_i64, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2,
--                                     tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_ld32u_i64, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2,
--                                     tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_ld32s_i64, ret, arg2, offset);
--}
--
--static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2,
--                                  tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_ld_i64, ret, arg2, offset);
--}
--
--static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
--                                   tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_st8_i64, arg1, arg2, offset);
--}
--
--static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
--                                    tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_st16_i64, arg1, arg2, offset);
--}
--
--static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
--                                    tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_st32_i64, arg1, arg2, offset);
--}
--
--static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2,
--                                  tcg_target_long offset)
--{
--    tcg_gen_ldst_op_i64(INDEX_op_st_i64, arg1, arg2, offset);
--}
--
--static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_add_i64, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_sub_i64, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_shl_i64, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_shr_i64, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_sar_i64, ret, arg1, arg2);
--}
--
--static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
--{
--    tcg_gen_op3_i64(INDEX_op_mul_i64, ret, arg1, arg2);
--}
--#else /* TCG_TARGET_REG_BITS == 32 */
--void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
--
--void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--
--void tcg_gen_discard_i64(TCGv_i64 arg);
--void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
--void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
--void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
--#endif /* TCG_TARGET_REG_BITS */
--
--static inline void tcg_gen_neg_i64(TCGv_i64 ret, TCGv_i64 arg)
--{
--    if (TCG_TARGET_HAS_neg_i64) {
--        tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg);
--    } else {
--        tcg_gen_subfi_i64(ret, 0, arg);
--    }
--}
--
--/* Size changing operations.  */
--
--void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
--void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
--void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high);
--void tcg_gen_extrl_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
--void tcg_gen_extrh_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
--void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg);
--void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg);
--
--void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src);
--void tcg_gen_extr_i128_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i128 arg);
--void tcg_gen_concat_i64_i128(TCGv_i128 ret, TCGv_i64 lo, TCGv_i64 hi);
--
--static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
--{
--    tcg_gen_deposit_i64(ret, lo, hi, 32, 32);
--}
--
--/* QEMU specific operations.  */
-+#include "tcg/tcg-op-common.h"
- #ifndef TARGET_LONG_BITS
- #error must include QEMU headers
-@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1,
- # error "Unhandled number of operands to insn_start"
- #endif
--/**
-- * tcg_gen_exit_tb() - output exit_tb TCG operation
-- * @tb: The TranslationBlock from which we are exiting
-- * @idx: Direct jump slot index, or exit request
-- *
-- * See tcg/README for more info about this TCG operation.
-- * See also tcg.h and the block comment above TB_EXIT_MASK.
-- *
-- * For a normal exit from the TB, back to the main loop, @tb should
-- * be NULL and @idx should be 0.  Otherwise, @tb should be valid and
-- * @idx should be one of the TB_EXIT_ values.
-- */
--void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx);
--
--/**
-- * tcg_gen_goto_tb() - output goto_tb TCG operation
-- * @idx: Direct jump slot index (0 or 1)
-- *
-- * See tcg/README for more info about this TCG operation.
-- *
-- * NOTE: In softmmu emulation, direct jumps with goto_tb are only safe within
-- * the pages this TB resides in because we don't take care of direct jumps when
-- * address mapping changes, e.g. in tlb_flush(). In user mode, there's only a
-- * static address translation, so the destination address is always valid, TBs
-- * are always invalidated properly, and direct jumps are reset when mapping
-- * changes.
-- */
--void tcg_gen_goto_tb(unsigned idx);
--
--/**
-- * tcg_gen_lookup_and_goto_ptr() - look up the current TB, jump to it if valid
-- * @addr: Guest address of the target TB
-- *
-- * If the TB is not valid, jump to the epilogue.
-- *
-- * This operation is optional. If the TCG backend does not implement goto_ptr,
-- * this op is equivalent to calling tcg_gen_exit_tb() with 0 as the argument.
-- */
--void tcg_gen_lookup_and_goto_ptr(void);
--
--static inline void tcg_gen_plugin_cb_start(unsigned from, unsigned type,
--                                           unsigned wr)
--{
--    tcg_gen_op3(INDEX_op_plugin_cb_start, from, type, wr);
--}
--
--static inline void tcg_gen_plugin_cb_end(void)
--{
--    tcg_emit_op(INDEX_op_plugin_cb_end, 0);
--}
--
- #if TARGET_LONG_BITS == 32
- typedef TCGv_i32 TCGv;
- #define tcg_temp_new() tcg_temp_new_i32()
-@@ -XXX,XX +XXX,XX @@ typedef TCGv_i64 TCGv;
- #error Unhandled TARGET_LONG_BITS value
- #endif
--void tcg_gen_qemu_ld_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
--void tcg_gen_qemu_st_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
--void tcg_gen_qemu_ld_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType);
--void tcg_gen_qemu_st_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType);
--void tcg_gen_qemu_ld_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType);
--void tcg_gen_qemu_st_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType);
--
- static inline void
- tcg_gen_qemu_ld_i32(TCGv_i32 v, TCGv a, TCGArg i, MemOp m)
- {
-@@ -XXX,XX +XXX,XX @@ tcg_gen_qemu_st_i128(TCGv_i128 v, TCGv a, TCGArg i, MemOp m)
-     tcg_gen_qemu_st_i128_chk(v, tcgv_tl_temp(a), i, m, TCG_TYPE_TL);
- }
--void tcg_gen_atomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32,
--                                    TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64,
--                                    TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128,
--                                     TCGv_i128, TCGArg, MemOp, TCGType);
--
--void tcg_gen_nonatomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_nonatomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_nonatomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128,
--                                        TCGv_i128, TCGArg, MemOp, TCGType);
--
--void tcg_gen_atomic_xchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                 TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_xchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                 TCGArg, MemOp, TCGType);
--
--void tcg_gen_atomic_fetch_add_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_add_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_and_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_and_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_or_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                     TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_or_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                     TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_xor_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_xor_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_smin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_smin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_umin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_umin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_smax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_smax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_umax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_fetch_umax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--
--void tcg_gen_atomic_add_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_add_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_and_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_and_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_or_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                     TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_or_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                     TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_xor_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_xor_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                      TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_smin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_smin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_umin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_umin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_smax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_smax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_umax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
--                                       TCGArg, MemOp, TCGType);
--void tcg_gen_atomic_umax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
--                                       TCGArg, MemOp, TCGType);
--
- #define DEF_ATOMIC2(N, S)                                               \
-     static inline void N##_##S(TCGv_##S r, TCGv a, TCGv_##S v,          \
-                                TCGArg i, MemOp m)                       \
-@@ -XXX,XX +XXX,XX @@ DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64)
- #undef DEF_ATOMIC2
- #undef DEF_ATOMIC3
--void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
--void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
--void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
--void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
--void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
--void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
--void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
--void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
--void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_smin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_umin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_smax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--
--void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
--void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
--void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
--void tcg_gen_rotli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
--void tcg_gen_rotri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
--
--void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
--void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
--void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
--void tcg_gen_rotls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
--
--void tcg_gen_shlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
--void tcg_gen_shrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
--void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
--void tcg_gen_rotlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
--void tcg_gen_rotrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
--
--void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r,
--                     TCGv_vec a, TCGv_vec b);
--
--void tcg_gen_bitsel_vec(unsigned vece, TCGv_vec r, TCGv_vec a,
--                        TCGv_vec b, TCGv_vec c);
--void tcg_gen_cmpsel_vec(TCGCond cond, unsigned vece, TCGv_vec r,
--                        TCGv_vec a, TCGv_vec b, TCGv_vec c, TCGv_vec d);
--
--void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
--void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
--void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
--
- #if TARGET_LONG_BITS == 64
- #define tcg_gen_movi_tl tcg_gen_movi_i64
- #define tcg_gen_mov_tl tcg_gen_mov_i64
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
-         : (VECE) == MO_32 ? 0x00000001ul * (uint32_t)(C)           \
-         : (qemu_build_not_reached_always(), 0))                    \
-      :  (target_long)dup_const(VECE, C))
--#endif
--
--#if UINTPTR_MAX == UINT32_MAX
--# define PTR  i32
--# define NAT  TCGv_i32
--#else
--# define PTR  i64
--# define NAT  TCGv_i64
--#endif
--
--static inline void tcg_gen_ld_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t o)
--{
--    glue(tcg_gen_ld_,PTR)((NAT)r, a, o);
--}
--
--static inline void tcg_gen_st_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t o)
--{
--    glue(tcg_gen_st_, PTR)((NAT)r, a, o);
--}
--
--static inline void tcg_gen_discard_ptr(TCGv_ptr a)
--{
--    glue(tcg_gen_discard_,PTR)((NAT)a);
--}
--
--static inline void tcg_gen_add_ptr(TCGv_ptr r, TCGv_ptr a, TCGv_ptr b)
--{
--    glue(tcg_gen_add_,PTR)((NAT)r, (NAT)a, (NAT)b);
--}
--
--static inline void tcg_gen_addi_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t b)
--{
--    glue(tcg_gen_addi_,PTR)((NAT)r, (NAT)a, b);
--}
--
--static inline void tcg_gen_mov_ptr(TCGv_ptr d, TCGv_ptr s)
--{
--    glue(tcg_gen_mov_,PTR)((NAT)d, (NAT)s);
--}
--
--static inline void tcg_gen_movi_ptr(TCGv_ptr d, intptr_t s)
--{
--    glue(tcg_gen_movi_,PTR)((NAT)d, s);
--}
--
--static inline void tcg_gen_brcondi_ptr(TCGCond cond, TCGv_ptr a,
--                                       intptr_t b, TCGLabel *label)
--{
--    glue(tcg_gen_brcondi_,PTR)(cond, (NAT)a, b, label);
--}
--
--static inline void tcg_gen_ext_i32_ptr(TCGv_ptr r, TCGv_i32 a)
--{
--#if UINTPTR_MAX == UINT32_MAX
--    tcg_gen_mov_i32((NAT)r, a);
--#else
--    tcg_gen_ext_i32_i64((NAT)r, a);
--#endif
--}
--
--static inline void tcg_gen_trunc_i64_ptr(TCGv_ptr r, TCGv_i64 a)
--{
--#if UINTPTR_MAX == UINT32_MAX
--    tcg_gen_extrl_i64_i32((NAT)r, a);
--#else
--    tcg_gen_mov_i64((NAT)r, a);
--#endif
--}
--
--static inline void tcg_gen_extu_ptr_i64(TCGv_i64 r, TCGv_ptr a)
--{
--#if UINTPTR_MAX == UINT32_MAX
--    tcg_gen_extu_i32_i64(r, (NAT)a);
--#else
--    tcg_gen_mov_i64(r, (NAT)a);
--#endif
--}
--
--static inline void tcg_gen_trunc_ptr_i32(TCGv_i32 r, TCGv_ptr a)
--{
--#if UINTPTR_MAX == UINT32_MAX
--    tcg_gen_mov_i32(r, (NAT)a);
--#else
--    tcg_gen_extrl_i64_i32(r, (NAT)a);
--#endif
--}
--
--#undef PTR
--#undef NAT
-+#endif /* TARGET_LONG_BITS == 64 */
- #endif /* TCG_TCG_OP_H */
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
+         g_assert_not_reached();
- #include "qemu/osdep.h"
+     }
- #include "qemu/int128.h"
--#include "tcg/tcg-op.h"
+-    if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
-+#include "tcg/tcg-op-common.h"
++    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
- #include "tcg-internal.h"
+         return true;
+     }
- #define CASE_OP_32_64(x)                        \
-diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
+     s_mask = s_mask_old >> pos;
---- a/tcg/tcg-op-gvec.c
+     s_mask |= -1ull << (len - 1);
-+++ b/tcg/tcg-op-gvec.c
-@@ -XXX,XX +XXX,XX @@
+-    if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
- #include "qemu/osdep.h"
++    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
- #include "tcg/tcg.h"
+         return true;
- #include "tcg/tcg-temp-internal.h"
+     }
 -#include "tcg/tcg-op.h"
 +#include "tcg/tcg-op-common.h"
  #include "tcg/tcg-op-gvec.h"
  #include "tcg/tcg-gvec-desc.h"
 diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op-ldst.c
 +++ b/tcg/tcg-op-ldst.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/exec-all.h"
  #include "tcg/tcg.h"
  #include "tcg/tcg-temp-internal.h"
 -#include "tcg/tcg-op.h"
 +#include "tcg/tcg-op-common.h"
  #include "tcg/tcg-mo.h"
  #include "exec/plugin-gen.h"
  #include "tcg-internal.h"
 diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op-vec.c
 +++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/osdep.h"
  #include "tcg/tcg.h"
  #include "tcg/tcg-temp-internal.h"
 -#include "tcg/tcg-op.h"
 +#include "tcg/tcg-op-common.h"
  #include "tcg/tcg-mo.h"
  #include "tcg-internal.h"
 diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op.c
 +++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/exec-all.h"
  #include "tcg/tcg.h"
  #include "tcg/tcg-temp-internal.h"
 -#include "tcg/tcg-op.h"
 +#include "tcg/tcg-op-common.h"
  #include "exec/plugin-gen.h"
  #include "tcg-internal.h"
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/exec-all.h"
  #include "exec/tlb-common.h"
 -#include "tcg/tcg-op.h"
 +#include "tcg/tcg-op-common.h"
  #if UINTPTR_MAX == UINT32_MAX
  # define ELF_CLASS  ELFCLASS32
 diff --git a/tcg/tci.c b/tcg/tci.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci.c
 +++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 -#include "exec/cpu_ldst.h"
 -#include "tcg/tcg-op.h"
 +#include "tcg/tcg.h"
  #include "tcg/tcg-ldst.h"
  #include <ffi.h>
 --
-.34.1
+.43.0

-[PULL 43/52] accel/tcg: Tidy includes for translator.[ch]
+[PULL 53/72] tcg/optimize: Move fold_bitsel_vec into alphabetic sort
-Reduce the header to only bswap.h and cpu_ldst.h.
+The big comment just above says functions should be sorted.
-Move exec/translate-all.h to translator.c.
+Add forward declarations as needed.
 Reduce tcg.h and tcg-op.h to tcg-op-common.h.
 Remove otherwise unused headers.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/translator.h | 6 +-----
+ tcg/optimize.c | 114 +++++++++++++++++++++++++------------------------
- accel/tcg/translator.c    | 8 +++-----
+file changed, 59 insertions(+), 55 deletions(-)
 files changed, 4 insertions(+), 10 deletions(-)
-diff --git a/include/exec/translator.h b/include/exec/translator.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/translator.h
+--- a/tcg/optimize.c
-+++ b/include/exec/translator.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
-  * member in your target-specific DisasContext.
+  *   3) those that produce information about the result value.
   */
++static bool fold_or(OptContext *ctx, TCGOp *op);
++static bool fold_orc(OptContext *ctx, TCGOp *op);
++static bool fold_xor(OptContext *ctx, TCGOp *op);
++
+ static bool fold_add(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2_commutative(ctx, op) ||
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
++static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
++{
++    /* If true and false values are the same, eliminate the cmp. */
++    if (args_are_copies(op->args[2], op->args[3])) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
++    }
++
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
++        uint64_t tv = arg_info(op->args[2])->val;
++        uint64_t fv = arg_info(op->args[3])->val;
++
++        if (tv == -1 && fv == 0) {
++            return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
++        }
++        if (tv == 0 && fv == -1) {
++            if (TCG_TARGET_HAS_not_vec) {
++                op->opc = INDEX_op_not_vec;
++                return fold_not(ctx, op);
++            } else {
++                op->opc = INDEX_op_xor_vec;
++                op->args[2] = arg_new_constant(ctx, -1);
++                return fold_xor(ctx, op);
++            }
++        }
++    }
++    if (arg_is_const(op->args[2])) {
++        uint64_t tv = arg_info(op->args[2])->val;
++        if (tv == -1) {
++            op->opc = INDEX_op_or_vec;
++            op->args[2] = op->args[3];
++            return fold_or(ctx, op);
++        }
++        if (tv == 0 && TCG_TARGET_HAS_andc_vec) {
++            op->opc = INDEX_op_andc_vec;
++            op->args[2] = op->args[1];
++            op->args[1] = op->args[3];
++            return fold_andc(ctx, op);
++        }
++    }
++    if (arg_is_const(op->args[3])) {
++        uint64_t fv = arg_info(op->args[3])->val;
++        if (fv == 0) {
++            op->opc = INDEX_op_and_vec;
++            return fold_and(ctx, op);
++        }
++        if (fv == -1 && TCG_TARGET_HAS_orc_vec) {
++            op->opc = INDEX_op_orc_vec;
++            op->args[2] = op->args[1];
++            op->args[1] = op->args[3];
++            return fold_orc(ctx, op);
++        }
++    }
++    return finish_folding(ctx, op);
++}
++
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
+ {
+     int i = do_constant_folding_cond1(ctx, op, NO_DEST, &op->args[0],
+@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
+     return fold_masks_zs(ctx, op, z_mask, s_mask);
+ }
+-static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
+-{
+-    /* If true and false values are the same, eliminate the cmp. */
+-    if (args_are_copies(op->args[2], op->args[3])) {
+-        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
+-    }
 -
- #include "qemu/bswap.h"
+-    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
--#include "exec/exec-all.h"
+-        uint64_t tv = arg_info(op->args[2])->val;
--#include "exec/cpu_ldst.h"
+-        uint64_t fv = arg_info(op->args[3])->val;
 -#include "exec/translate-all.h"
 -#include "tcg/tcg.h"
 +#include "exec/cpu_ldst.h"    /* for abi_ptr */
  /**
   * gen_intermediate_code
 diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translator.c
 +++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@
   */
  #include "qemu/osdep.h"
 +#include "qemu/log.h"
  #include "qemu/error-report.h"
 -#include "tcg/tcg.h"
 -#include "tcg/tcg-op.h"
  #include "exec/exec-all.h"
 -#include "exec/log.h"
  #include "exec/translator.h"
 +#include "exec/translate-all.h"
  #include "exec/plugin-gen.h"
 -#include "exec/replay-core.h"
 -
-+#include "tcg/tcg-op-common.h"
+-        if (tv == -1 && fv == 0) {
+-            return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
- static void gen_io_start(void)
+-        }
 -        if (tv == 0 && fv == -1) {
 -            if (TCG_TARGET_HAS_not_vec) {
 -                op->opc = INDEX_op_not_vec;
 -                return fold_not(ctx, op);
 -            } else {
 -                op->opc = INDEX_op_xor_vec;
 -                op->args[2] = arg_new_constant(ctx, -1);
 -                return fold_xor(ctx, op);
 -            }
 -        }
 -    }
 -    if (arg_is_const(op->args[2])) {
 -        uint64_t tv = arg_info(op->args[2])->val;
 -        if (tv == -1) {
 -            op->opc = INDEX_op_or_vec;
 -            op->args[2] = op->args[3];
 -            return fold_or(ctx, op);
 -        }
 -        if (tv == 0 && TCG_TARGET_HAS_andc_vec) {
 -            op->opc = INDEX_op_andc_vec;
 -            op->args[2] = op->args[1];
 -            op->args[1] = op->args[3];
 -            return fold_andc(ctx, op);
 -        }
 -    }
 -    if (arg_is_const(op->args[3])) {
 -        uint64_t fv = arg_info(op->args[3])->val;
 -        if (fv == 0) {
 -            op->opc = INDEX_op_and_vec;
 -            return fold_and(ctx, op);
 -        }
 -        if (fv == -1 && TCG_TARGET_HAS_orc_vec) {
 -            op->opc = INDEX_op_orc_vec;
 -            op->args[2] = op->args[1];
 -            op->args[1] = op->args[3];
 -            return fold_orc(ctx, op);
 -        }
 -    }
 -    return finish_folding(ctx, op);
 -}
 -
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
 --
-.34.1
+.43.0

-[PULL 35/52] accel/tcg: Move most of gen-icount.h into translator.c
+[PULL 54/72] tcg/optimize: Move fold_cmp_vec, fold_cmpsel_vec into alphabetic sort
-The only usage of gen_tb_start and gen_tb_end are here.
+The big comment just above says functions should be sorted.
 Move the static icount_start_insn variable into a local
 within translator_loop.  Simplify the two subroutines
 by passing in the existing local cflags variable.
-Leave only the declaration of gen_io_start in gen-icount.h.
+Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/gen-icount.h | 79 +------------------------------------
+ tcg/optimize.c | 60 +++++++++++++++++++++++++-------------------------
- accel/tcg/translator.c    | 83 ++++++++++++++++++++++++++++++++++++++-
+file changed, 30 insertions(+), 30 deletions(-)
 files changed, 82 insertions(+), 80 deletions(-)
-diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/gen-icount.h
+--- a/tcg/optimize.c
-+++ b/include/exec/gen-icount.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
- #ifndef GEN_ICOUNT_H
+     return true;
- #define GEN_ICOUNT_H
+ }
--#include "exec/exec-all.h"
++static bool fold_cmp_vec(OptContext *ctx, TCGOp *op)
--
++{
--/* Helpers for instruction counting code generation.  */
++    /* Canonicalize the comparison to put immediate second. */
--
++    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
--static TCGOp *icount_start_insn;
++        op->args[3] = tcg_swap_cond(op->args[3]);
--
++    }
--static inline void gen_io_start(void)
++    return finish_folding(ctx, op);
 +}
 +
 +static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
 +{
 +    /* If true and false values are the same, eliminate the cmp. */
 +    if (args_are_copies(op->args[3], op->args[4])) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[3]);
 +    }
 +
 +    /* Canonicalize the comparison to put immediate second. */
 +    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
 +        op->args[5] = tcg_swap_cond(op->args[5]);
 +    }
 +    /*
 +     * Canonicalize the "false" input reg to match the destination,
 +     * so that the tcg backend can implement "move if true".
 +     */
 +    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 +        op->args[5] = tcg_invert_cond(op->args[5]);
 +    }
 +    return finish_folding(ctx, op);
 +}
 +
  static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
  {
      uint64_t z_mask, s_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
      return tcg_opt_gen_movi(ctx, op, op->args[0], i);
  }
 -static bool fold_cmp_vec(OptContext *ctx, TCGOp *op)
 -{
--    tcg_gen_st_i32(tcg_constant_i32(1), cpu_env,
+-    /* Canonicalize the comparison to put immediate second. */
--                   offsetof(ArchCPU, parent_obj.can_do_io) -
+-    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
--                   offsetof(ArchCPU, env));
+-        op->args[3] = tcg_swap_cond(op->args[3]);
 -    }
 -    return finish_folding(ctx, op);
 -}
 -
--static inline void gen_tb_start(const TranslationBlock *tb)
+-static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
 -{
--    TCGv_i32 count = tcg_temp_new_i32();
+-    /* If true and false values are the same, eliminate the cmp. */
--
+-    if (args_are_copies(op->args[3], op->args[4])) {
--    tcg_gen_ld_i32(count, cpu_env,
+-        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[3]);
 -                   offsetof(ArchCPU, neg.icount_decr.u32) -
 -                   offsetof(ArchCPU, env));
 -
 -    if (tb_cflags(tb) & CF_USE_ICOUNT) {
 -        /*
 -         * We emit a sub with a dummy immediate argument. Keep the insn index
 -         * of the sub so that we later (when we know the actual insn count)
 -         * can update the argument with the actual insn count.
 -         */
 -        tcg_gen_sub_i32(count, count, tcg_constant_i32(0));
 -        icount_start_insn = tcg_last_op();
 -    }
 -
+-    /* Canonicalize the comparison to put immediate second. */
+-    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+-        op->args[5] = tcg_swap_cond(op->args[5]);
+-    }
 -    /*
--     * Emit the check against icount_decr.u32 to see if we should exit
+-     * Canonicalize the "false" input reg to match the destination,
--     * unless we suppress the check with CF_NOIRQ. If we are using
+-     * so that the tcg backend can implement "move if true".
 -     * icount and have suppressed interruption the higher level code
 -     * should have ensured we don't run more instructions than the
 -     * budget.
 -     */
--    if (tb_cflags(tb) & CF_NOIRQ) {
+-    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
--        tcg_ctx->exitreq_label = NULL;
+-        op->args[5] = tcg_invert_cond(op->args[5]);
 -    } else {
 -        tcg_ctx->exitreq_label = gen_new_label();
 -        tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, tcg_ctx->exitreq_label);
 -    }
--
+-    return finish_folding(ctx, op);
 -    if (tb_cflags(tb) & CF_USE_ICOUNT) {
 -        tcg_gen_st16_i32(count, cpu_env,
 -                         offsetof(ArchCPU, neg.icount_decr.u16.low) -
 -                         offsetof(ArchCPU, env));
 -        /*
 -         * cpu->can_do_io is cleared automatically here at the beginning of
 -         * each translation block.  The cost is minimal and only paid for
 -         * -icount, plus it would be very easy to forget doing it in the
 -         * translator. Doing it here means we don't need a gen_io_end() to
 -         * go with gen_io_start().
 -         */
 -        tcg_gen_st_i32(tcg_constant_i32(0), cpu_env,
 -                       offsetof(ArchCPU, parent_obj.can_do_io) -
 -                       offsetof(ArchCPU, env));
 -    }
 -}
 -
--static inline void gen_tb_end(const TranslationBlock *tb, int num_insns)
+ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 -{
 -    if (tb_cflags(tb) & CF_USE_ICOUNT) {
 -        /*
 -         * Update the num_insn immediate parameter now that we know
 -         * the actual insn count.
 -         */
 -        tcg_set_insn_param(icount_start_insn, 2,
 -                           tcgv_i32_arg(tcg_constant_i32(num_insns)));
 -    }
 -
 -    if (tcg_ctx->exitreq_label) {
 -        gen_set_label(tcg_ctx->exitreq_label);
 -        tcg_gen_exit_tb(tb, TB_EXIT_REQUESTED);
 -    }
 -}
 +void gen_io_start(void);
  #endif
 diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translator.c
 +++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/plugin-gen.h"
  #include "exec/replay-core.h"
 +
 +void gen_io_start(void)
 +{
 +    tcg_gen_st_i32(tcg_constant_i32(1), cpu_env,
 +                   offsetof(ArchCPU, parent_obj.can_do_io) -
 +                   offsetof(ArchCPU, env));
 +}
 +
 +static TCGOp *gen_tb_start(uint32_t cflags)
 +{
 +    TCGv_i32 count = tcg_temp_new_i32();
 +    TCGOp *icount_start_insn = NULL;
 +
 +    tcg_gen_ld_i32(count, cpu_env,
 +                   offsetof(ArchCPU, neg.icount_decr.u32) -
 +                   offsetof(ArchCPU, env));
 +
 +    if (cflags & CF_USE_ICOUNT) {
 +        /*
 +         * We emit a sub with a dummy immediate argument. Keep the insn index
 +         * of the sub so that we later (when we know the actual insn count)
 +         * can update the argument with the actual insn count.
 +         */
 +        tcg_gen_sub_i32(count, count, tcg_constant_i32(0));
 +        icount_start_insn = tcg_last_op();
 +    }
 +
 +    /*
 +     * Emit the check against icount_decr.u32 to see if we should exit
 +     * unless we suppress the check with CF_NOIRQ. If we are using
 +     * icount and have suppressed interruption the higher level code
 +     * should have ensured we don't run more instructions than the
 +     * budget.
 +     */
 +    if (cflags & CF_NOIRQ) {
 +        tcg_ctx->exitreq_label = NULL;
 +    } else {
 +        tcg_ctx->exitreq_label = gen_new_label();
 +        tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, tcg_ctx->exitreq_label);
 +    }
 +
 +    if (cflags & CF_USE_ICOUNT) {
 +        tcg_gen_st16_i32(count, cpu_env,
 +                         offsetof(ArchCPU, neg.icount_decr.u16.low) -
 +                         offsetof(ArchCPU, env));
 +        /*
 +         * cpu->can_do_io is cleared automatically here at the beginning of
 +         * each translation block.  The cost is minimal and only paid for
 +         * -icount, plus it would be very easy to forget doing it in the
 +         * translator. Doing it here means we don't need a gen_io_end() to
 +         * go with gen_io_start().
 +         */
 +        tcg_gen_st_i32(tcg_constant_i32(0), cpu_env,
 +                       offsetof(ArchCPU, parent_obj.can_do_io) -
 +                       offsetof(ArchCPU, env));
 +    }
 +
 +    return icount_start_insn;
 +}
 +
 +static void gen_tb_end(const TranslationBlock *tb, uint32_t cflags,
 +                       TCGOp *icount_start_insn, int num_insns)
 +{
 +    if (cflags & CF_USE_ICOUNT) {
 +        /*
 +         * Update the num_insn immediate parameter now that we know
 +         * the actual insn count.
 +         */
 +        tcg_set_insn_param(icount_start_insn, 2,
 +                           tcgv_i32_arg(tcg_constant_i32(num_insns)));
 +    }
 +
 +    if (tcg_ctx->exitreq_label) {
 +        gen_set_label(tcg_ctx->exitreq_label);
 +        tcg_gen_exit_tb(tb, TB_EXIT_REQUESTED);
 +    }
 +}
 +
  bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
  {
-     /* Suppress goto_tb if requested. */
+     uint64_t z_mask, s_mask, s_mask_old;
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
                       const TranslatorOps *ops, DisasContextBase *db)
  {
      uint32_t cflags = tb_cflags(tb);
 +    TCGOp *icount_start_insn;
      bool plugin_enabled;
      /* Initialize DisasContext */
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
      /* Start translating.  */
 -    gen_tb_start(db->tb);
 +    icount_start_insn = gen_tb_start(cflags);
      ops->tb_start(db, cpu);
      tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
      /* Emit code to exit the TB, as indicated by db->is_jmp.  */
      ops->tb_stop(db, cpu);
 -    gen_tb_end(db->tb, db->num_insns);
 +    gen_tb_end(tb, cflags, icount_start_insn, db->num_insns);
      if (plugin_enabled) {
          plugin_gen_tb_end(cpu);
 --
-.34.1
+.43.0

-[PULL 19/52] tcg: Move TCGHelperInfo and dependencies to tcg/helper-info.h
+[PULL 55/72] softfloat: Add float{16,32,64}_muladd_scalbn
-This will be required outside of tcg-internal.h soon.
+We currently have a flag, float_muladd_halve_result, to scale
 the result by 2**-1.  Extend this to handle arbitrary scaling.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/helper-info.h | 59 +++++++++++++++++++++++++++++++++++++++
+ include/fpu/softfloat.h   |  6 ++++
- tcg/tcg-internal.h        | 47 +------------------------------
+ fpu/softfloat.c           | 58 ++++++++++++++++++++++-----------------
-files changed, 60 insertions(+), 46 deletions(-)
+ fpu/softfloat-parts.c.inc |  7 +++--
- create mode 100644 include/tcg/helper-info.h
+files changed, 44 insertions(+), 27 deletions(-)
-diff --git a/include/tcg/helper-info.h b/include/tcg/helper-info.h
+diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
-new file mode 100644
+index XXXXXXX..XXXXXXX 100644
-index XXXXXXX..XXXXXXX
+--- a/include/fpu/softfloat.h
---- /dev/null
++++ b/include/fpu/softfloat.h
-+++ b/include/tcg/helper-info.h
+@@ -XXX,XX +XXX,XX @@ float16 float16_add(float16, float16, float_status *status);
-@@ -XXX,XX +XXX,XX @@
+ float16 float16_sub(float16, float16, float_status *status);
-+/*
+ float16 float16_mul(float16, float16, float_status *status);
-+ * TCG Helper Information Structure
+ float16 float16_muladd(float16, float16, float16, int, float_status *status);
-+ *
++float16 float16_muladd_scalbn(float16, float16, float16,
-+ * Copyright (c) 2023 Linaro Ltd
++                              int, int, float_status *status);
-+ *
+ float16 float16_div(float16, float16, float_status *status);
-+ * SPDX-License-Identifier: GPL-2.0-or-later
+ float16 float16_scalbn(float16, int, float_status *status);
-+ */
+ float16 float16_min(float16, float16, float_status *status);
@@ -XXX,XX +XXX,XX @@ float32 float32_mul(float32, float32, float_status *status);
  float32 float32_div(float32, float32, float_status *status);
  float32 float32_rem(float32, float32, float_status *status);
  float32 float32_muladd(float32, float32, float32, int, float_status *status);
 +float32 float32_muladd_scalbn(float32, float32, float32,
 +                              int, int, float_status *status);
  float32 float32_sqrt(float32, float_status *status);
  float32 float32_exp2(float32, float_status *status);
  float32 float32_log2(float32, float_status *status);
@@ -XXX,XX +XXX,XX @@ float64 float64_mul(float64, float64, float_status *status);
  float64 float64_div(float64, float64, float_status *status);
  float64 float64_rem(float64, float64, float_status *status);
  float64 float64_muladd(float64, float64, float64, int, float_status *status);
 +float64 float64_muladd_scalbn(float64, float64, float64,
 +                              int, int, float_status *status);
  float64 float64_sqrt(float64, float_status *status);
  float64 float64_log2(float64, float_status *status);
  FloatRelation float64_compare(float64, float64, float_status *status);
 diff --git a/fpu/softfloat.c b/fpu/softfloat.c
 index XXXXXXX..XXXXXXX 100644
 --- a/fpu/softfloat.c
 +++ b/fpu/softfloat.c
@@ -XXX,XX +XXX,XX @@ static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
  #define parts_mul(A, B, S) \
      PARTS_GENERIC_64_128(mul, A)(A, B, S)
 -static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
 -                                    FloatParts64 *c, int flags,
 -                                    float_status *s);
 -static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
 -                                      FloatParts128 *c, int flags,
 -                                      float_status *s);
 +static FloatParts64 *parts64_muladd_scalbn(FloatParts64 *a, FloatParts64 *b,
 +                                           FloatParts64 *c, int scale,
 +                                           int flags, float_status *s);
 +static FloatParts128 *parts128_muladd_scalbn(FloatParts128 *a, FloatParts128 *b,
 +                                             FloatParts128 *c, int scale,
 +                                             int flags, float_status *s);
 -#define parts_muladd(A, B, C, Z, S) \
 -    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
 +#define parts_muladd_scalbn(A, B, C, Z, Y, S) \
 +    PARTS_GENERIC_64_128(muladd_scalbn, A)(A, B, C, Z, Y, S)
  static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                   float_status *s);
@@ -XXX,XX +XXX,XX @@ floatx80_mul(floatx80 a, floatx80 b, float_status *status)
   * Fused multiply-add
   */
 -float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
 -                                    int flags, float_status *status)
 +float16 QEMU_FLATTEN
 +float16_muladd_scalbn(float16 a, float16 b, float16 c,
 +                      int scale, int flags, float_status *status)
  {
      FloatParts64 pa, pb, pc, *pr;
      float16_unpack_canonical(&pa, a, status);
      float16_unpack_canonical(&pb, b, status);
      float16_unpack_canonical(&pc, c, status);
 -    pr = parts_muladd(&pa, &pb, &pc, flags, status);
 +    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
      return float16_round_pack_canonical(pr, status);
  }
 -static float32 QEMU_SOFTFLOAT_ATTR
 -soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
 -                float_status *status)
 +float16 float16_muladd(float16 a, float16 b, float16 c,
 +                       int flags, float_status *status)
 +{
 +    return float16_muladd_scalbn(a, b, c, 0, flags, status);
 +}
 +
-+#ifndef TCG_HELPER_INFO_H
++float32 QEMU_SOFTFLOAT_ATTR
-+#define TCG_HELPER_INFO_H
++float32_muladd_scalbn(float32 a, float32 b, float32 c,
 +                      int scale, int flags, float_status *status)
  {
      FloatParts64 pa, pb, pc, *pr;
      float32_unpack_canonical(&pa, a, status);
      float32_unpack_canonical(&pb, b, status);
      float32_unpack_canonical(&pc, c, status);
 -    pr = parts_muladd(&pa, &pb, &pc, flags, status);
 +    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
      return float32_round_pack_canonical(pr, status);
  }
 -static float64 QEMU_SOFTFLOAT_ATTR
 -soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
 -                float_status *status)
 +float64 QEMU_SOFTFLOAT_ATTR
 +float64_muladd_scalbn(float64 a, float64 b, float64 c,
 +                      int scale, int flags, float_status *status)
  {
      FloatParts64 pa, pb, pc, *pr;
      float64_unpack_canonical(&pa, a, status);
      float64_unpack_canonical(&pb, b, status);
      float64_unpack_canonical(&pc, c, status);
 -    pr = parts_muladd(&pa, &pb, &pc, flags, status);
 +    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
      return float64_round_pack_canonical(pr, status);
  }
@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
      return ur.s;
   soft:
 -    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
 +    return float32_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s);
  }
  float64 QEMU_FLATTEN
@@ -XXX,XX +XXX,XX @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
      return ur.s;
   soft:
 -    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
 +    return float64_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s);
  }
  float64 float64r32_muladd(float64 a, float64 b, float64 c,
@@ -XXX,XX +XXX,XX @@ float64 float64r32_muladd(float64 a, float64 b, float64 c,
      float64_unpack_canonical(&pa, a, status);
      float64_unpack_canonical(&pb, b, status);
      float64_unpack_canonical(&pc, c, status);
 -    pr = parts_muladd(&pa, &pb, &pc, flags, status);
 +    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
      return float64r32_round_pack_canonical(pr, status);
  }
@@ -XXX,XX +XXX,XX @@ bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
      bfloat16_unpack_canonical(&pa, a, status);
      bfloat16_unpack_canonical(&pb, b, status);
      bfloat16_unpack_canonical(&pc, c, status);
 -    pr = parts_muladd(&pa, &pb, &pc, flags, status);
 +    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
      return bfloat16_round_pack_canonical(pr, status);
  }
@@ -XXX,XX +XXX,XX @@ float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
      float128_unpack_canonical(&pa, a, status);
      float128_unpack_canonical(&pb, b, status);
      float128_unpack_canonical(&pc, c, status);
 -    pr = parts_muladd(&pa, &pb, &pc, flags, status);
 +    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
      return float128_round_pack_canonical(pr, status);
  }
@@ -XXX,XX +XXX,XX @@ float32 float32_exp2(float32 a, float_status *status)
      float64_unpack_canonical(&rp, float64_one, status);
      for (i = 0 ; i < 15 ; i++) {
 +
-+#ifdef CONFIG_TCG_INTERPRETER
+         float64_unpack_canonical(&tp, float32_exp2_coefficients[i], status);
-+#include <ffi.h>
+-        rp = *parts_muladd(&tp, &xnp, &rp, 0, status);
-+#endif
++        rp = *parts_muladd_scalbn(&tp, &xnp, &rp, 0, 0, status);
-+
+         xnp = *parts_mul(&xnp, &xp, status);
-+/*
+     }
-+ * Describe the calling convention of a given argument type.
-+ */
+diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
 +typedef enum {
 +    TCG_CALL_RET_NORMAL,         /* by registers */
 +    TCG_CALL_RET_BY_REF,         /* for i128, by reference */
 +    TCG_CALL_RET_BY_VEC,         /* for i128, by vector register */
 +} TCGCallReturnKind;
 +
 +typedef enum {
 +    TCG_CALL_ARG_NORMAL,         /* by registers (continuing onto stack) */
 +    TCG_CALL_ARG_EVEN,           /* like normal, but skipping odd slots */
 +    TCG_CALL_ARG_EXTEND,         /* for i32, as a sign/zero-extended i64 */
 +    TCG_CALL_ARG_EXTEND_U,       /*      ... as a zero-extended i64 */
 +    TCG_CALL_ARG_EXTEND_S,       /*      ... as a sign-extended i64 */
 +    TCG_CALL_ARG_BY_REF,         /* for i128, by reference, first */
 +    TCG_CALL_ARG_BY_REF_N,       /*       ... by reference, subsequent */
 +} TCGCallArgumentKind;
 +
 +typedef struct TCGCallArgumentLoc {
 +    TCGCallArgumentKind kind    : 8;
 +    unsigned arg_slot           : 8;
 +    unsigned ref_slot           : 8;
 +    unsigned arg_idx            : 4;
 +    unsigned tmp_subindex       : 2;
 +} TCGCallArgumentLoc;
 +
 +typedef struct TCGHelperInfo {
 +    void *func;
 +    const char *name;
 +#ifdef CONFIG_TCG_INTERPRETER
 +    ffi_cif *cif;
 +#endif
 +    unsigned typemask           : 32;
 +    unsigned flags              : 8;
 +    unsigned nr_in              : 8;
 +    unsigned nr_out             : 8;
 +    TCGCallReturnKind out_kind  : 8;
 +
 +    /* Maximum physical arguments are constrained by TCG_TYPE_I128. */
 +    TCGCallArgumentLoc in[MAX_CALL_IARGS * (128 / TCG_TARGET_REG_BITS)];
 +} TCGHelperInfo;
 +
 +#endif /* TCG_HELPER_INFO_H */
 diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg-internal.h
+--- a/fpu/softfloat-parts.c.inc
-+++ b/tcg/tcg-internal.h
++++ b/fpu/softfloat-parts.c.inc
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
- #ifndef TCG_INTERNAL_H
+  * Requires A and C extracted into a double-sized structure to provide the
- #define TCG_INTERNAL_H
+  * extra space for the widening multiply.
+  */
--#ifdef CONFIG_TCG_INTERPRETER
+-static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
--#include <ffi.h>
+-                                   FloatPartsN *c, int flags, float_status *s)
--#endif
++static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
-+#include "tcg/helper-info.h"
++                                          FloatPartsN *c, int scale,
++                                          int flags, float_status *s)
- #define TCG_HIGHWATER 1024
+ {
+     int ab_mask, abc_mask;
--/*
+     FloatPartsW p_widen, c_widen;
-- * Describe the calling convention of a given argument type.
+@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
-- */
+     a->exp = p_widen.exp;
--typedef enum {
--    TCG_CALL_RET_NORMAL,         /* by registers */
+  return_normal:
--    TCG_CALL_RET_BY_REF,         /* for i128, by reference */
++    /* TODO: Replace all use of float_muladd_halve_result with scale. */
--    TCG_CALL_RET_BY_VEC,         /* for i128, by vector register */
+     if (flags & float_muladd_halve_result) {
--} TCGCallReturnKind;
+         a->exp -= 1;
--
+     }
--typedef enum {
++    a->exp += scale;
--    TCG_CALL_ARG_NORMAL,         /* by registers (continuing onto stack) */
+  finish_sign:
--    TCG_CALL_ARG_EVEN,           /* like normal, but skipping odd slots */
+     if (flags & float_muladd_negate_result) {
--    TCG_CALL_ARG_EXTEND,         /* for i32, as a sign/zero-extended i64 */
+         a->sign ^= 1;
 -    TCG_CALL_ARG_EXTEND_U,       /*      ... as a zero-extended i64 */
 -    TCG_CALL_ARG_EXTEND_S,       /*      ... as a sign-extended i64 */
 -    TCG_CALL_ARG_BY_REF,         /* for i128, by reference, first */
 -    TCG_CALL_ARG_BY_REF_N,       /*       ... by reference, subsequent */
 -} TCGCallArgumentKind;
 -
 -typedef struct TCGCallArgumentLoc {
 -    TCGCallArgumentKind kind    : 8;
 -    unsigned arg_slot           : 8;
 -    unsigned ref_slot           : 8;
 -    unsigned arg_idx            : 4;
 -    unsigned tmp_subindex       : 2;
 -} TCGCallArgumentLoc;
 -
 -typedef struct TCGHelperInfo {
 -    void *func;
 -    const char *name;
 -#ifdef CONFIG_TCG_INTERPRETER
 -    ffi_cif *cif;
 -#endif
 -    unsigned typemask           : 32;
 -    unsigned flags              : 8;
 -    unsigned nr_in              : 8;
 -    unsigned nr_out             : 8;
 -    TCGCallReturnKind out_kind  : 8;
 -
 -    /* Maximum physical arguments are constrained by TCG_TYPE_I128. */
 -    TCGCallArgumentLoc in[MAX_CALL_IARGS * (128 / TCG_TARGET_REG_BITS)];
 -} TCGHelperInfo;
 -
  extern TCGContext tcg_init_ctx;
  extern TCGContext **tcg_ctxs;
  extern unsigned int tcg_cur_ctxs;
 --
-.34.1
+.43.0

-[PULL 45/52] tcg: Move env defines out of NEED_CPU_H in helper-head.h
+[PULL 56/72] target/arm: Use float*_muladd_scalbn
-Since the change to CPUArchState, we have a common typedef
+Use the scalbn interface instead of float_muladd_halve_result.
 that can always be used.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/helper-head.h | 6 +++---
+ target/arm/tcg/helper-a64.c | 6 +++---
 file changed, 3 insertions(+), 3 deletions(-)
-diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
+diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/helper-head.h
+--- a/target/arm/tcg/helper-a64.c
-+++ b/include/exec/helper-head.h
++++ b/target/arm/tcg/helper-a64.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, float_status *fpst)
- #define dh_alias_f64 i64
+         (float16_is_infinity(b) && float16_is_zero(a))) {
- #define dh_alias_ptr ptr
+         return float16_one_point_five;
- #define dh_alias_cptr ptr
+     }
-+#define dh_alias_env ptr
+-    return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
- #define dh_alias_void void
++    return float16_muladd_scalbn(a, b, float16_three, -1, 0, fpst);
- #define dh_alias_noreturn noreturn
+ }
- #define dh_alias(t) glue(dh_alias_, t)
-@@ -XXX,XX +XXX,XX @@
+ float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst)
- #define dh_ctype_f64 float64
+@@ -XXX,XX +XXX,XX @@ float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst)
- #define dh_ctype_ptr void *
+         (float32_is_infinity(b) && float32_is_zero(a))) {
- #define dh_ctype_cptr const void *
+         return float32_one_point_five;
-+#define dh_ctype_env CPUArchState *
+     }
- #define dh_ctype_void void
+-    return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
- #define dh_ctype_noreturn G_NORETURN void
++    return float32_muladd_scalbn(a, b, float32_three, -1, 0, fpst);
- #define dh_ctype(t) dh_ctype_##t
+ }
-@@ -XXX,XX +XXX,XX @@
- #  endif
+ float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst)
- # endif
+@@ -XXX,XX +XXX,XX @@ float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst)
- # define dh_ctype_tl target_ulong
+         (float64_is_infinity(b) && float64_is_zero(a))) {
--# define dh_alias_env ptr
+         return float64_one_point_five;
--# define dh_ctype_env CPUArchState *
+     }
--# define dh_typecode_env dh_typecode_ptr
+-    return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
- #endif
++    return float64_muladd_scalbn(a, b, float64_three, -1, 0, fpst);
+ }
- /* We can't use glue() here because it falls foul of C preprocessor
-@@ -XXX,XX +XXX,XX @@
+ /* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
  #define dh_typecode_f32 dh_typecode_i32
  #define dh_typecode_f64 dh_typecode_i64
  #define dh_typecode_cptr dh_typecode_ptr
 +#define dh_typecode_env dh_typecode_ptr
  #define dh_typecode(t) dh_typecode_##t
  #define dh_callflag_i32  0
 --
-.34.1
+.43.0

-[PULL 17/52] target/hexagon: Include helper-gen.h where needed
+[PULL 57/72] target/sparc: Use float*_muladd_scalbn
-This had been included via tcg-op-common.h via tcg-op.h,
+Use the scalbn interface instead of float_muladd_halve_result.
 but that is going away.  In idef-parser.y, shuffle some
 tcg related includes into a more logical order.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/hexagon/genptr.c                  | 1 +
+ target/sparc/helper.h     |  4 +-
- target/hexagon/translate.c               | 1 +
+ target/sparc/fop_helper.c |  8 ++--
- target/hexagon/idef-parser/idef-parser.y | 3 ++-
+ target/sparc/translate.c  | 80 +++++++++++++++++++++++----------------
-files changed, 4 insertions(+), 1 deletion(-)
+files changed, 54 insertions(+), 38 deletions(-)
-diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
+diff --git a/target/sparc/helper.h b/target/sparc/helper.h
 index XXXXXXX..XXXXXXX 100644
---- a/target/hexagon/genptr.c
+--- a/target/sparc/helper.h
-+++ b/target/hexagon/genptr.c
++++ b/target/sparc/helper.h
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(faddd, TCG_CALL_NO_WG, f64, env, f64, f64)
- #include "internal.h"
+ DEF_HELPER_FLAGS_3(fsubd, TCG_CALL_NO_WG, f64, env, f64, f64)
- #include "tcg/tcg-op.h"
+ DEF_HELPER_FLAGS_3(fmuld, TCG_CALL_NO_WG, f64, env, f64, f64)
- #include "tcg/tcg-op-gvec.h"
+ DEF_HELPER_FLAGS_3(fdivd, TCG_CALL_NO_WG, f64, env, f64, f64)
-+#include "exec/helper-gen.h"
+-DEF_HELPER_FLAGS_5(fmaddd, TCG_CALL_NO_WG, f64, env, f64, f64, f64, i32)
- #include "insn.h"
++DEF_HELPER_FLAGS_6(fmaddd, TCG_CALL_NO_WG, f64, env, f64, f64, f64, s32, i32)
- #include "opcodes.h"
+ DEF_HELPER_FLAGS_3(fnaddd, TCG_CALL_NO_WG, f64, env, f64, f64)
- #include "translate.h"
+ DEF_HELPER_FLAGS_3(fnmuld, TCG_CALL_NO_WG, f64, env, f64, f64)
-diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(fadds, TCG_CALL_NO_WG, f32, env, f32, f32)
  DEF_HELPER_FLAGS_3(fsubs, TCG_CALL_NO_WG, f32, env, f32, f32)
  DEF_HELPER_FLAGS_3(fmuls, TCG_CALL_NO_WG, f32, env, f32, f32)
  DEF_HELPER_FLAGS_3(fdivs, TCG_CALL_NO_WG, f32, env, f32, f32)
 -DEF_HELPER_FLAGS_5(fmadds, TCG_CALL_NO_WG, f32, env, f32, f32, f32, i32)
 +DEF_HELPER_FLAGS_6(fmadds, TCG_CALL_NO_WG, f32, env, f32, f32, f32, s32, i32)
  DEF_HELPER_FLAGS_3(fnadds, TCG_CALL_NO_WG, f32, env, f32, f32)
  DEF_HELPER_FLAGS_3(fnmuls, TCG_CALL_NO_WG, f32, env, f32, f32)
 diff --git a/target/sparc/fop_helper.c b/target/sparc/fop_helper.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/hexagon/translate.c
+--- a/target/sparc/fop_helper.c
-+++ b/target/hexagon/translate.c
++++ b/target/sparc/fop_helper.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ Int128 helper_fsqrtq(CPUSPARCState *env, Int128 src)
- #include "cpu.h"
+ }
- #include "tcg/tcg-op.h"
- #include "tcg/tcg-op-gvec.h"
+ float32 helper_fmadds(CPUSPARCState *env, float32 s1,
-+#include "exec/helper-gen.h"
+-                      float32 s2, float32 s3, uint32_t op)
- #include "exec/cpu_ldst.h"
++                      float32 s2, float32 s3, int32_t sc, uint32_t op)
- #include "exec/log.h"
+ {
- #include "internal.h"
+-    float32 ret = float32_muladd(s1, s2, s3, op, &env->fp_status);
-diff --git a/target/hexagon/idef-parser/idef-parser.y b/target/hexagon/idef-parser/idef-parser.y
++    float32 ret = float32_muladd_scalbn(s1, s2, s3, sc, op, &env->fp_status);
      check_ieee_exceptions(env, GETPC());
      return ret;
  }
  float64 helper_fmaddd(CPUSPARCState *env, float64 s1,
 -                      float64 s2, float64 s3, uint32_t op)
 +                      float64 s2, float64 s3, int32_t sc, uint32_t op)
  {
 -    float64 ret = float64_muladd(s1, s2, s3, op, &env->fp_status);
 +    float64 ret = float64_muladd_scalbn(s1, s2, s3, sc, op, &env->fp_status);
      check_ieee_exceptions(env, GETPC());
      return ret;
  }
 diff --git a/target/sparc/translate.c b/target/sparc/translate.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/hexagon/idef-parser/idef-parser.y
+--- a/target/sparc/translate.c
-+++ b/target/hexagon/idef-parser/idef-parser.y
++++ b/target/sparc/translate.c
-@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
+@@ -XXX,XX +XXX,XX @@ static void gen_op_fabsq(TCGv_i128 dst, TCGv_i128 src)
-     fputs("#include \"qemu/log.h\"\n", output_file);
-     fputs("#include \"cpu.h\"\n", output_file);
+ static void gen_op_fmadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
-     fputs("#include \"internal.h\"\n", output_file);
+ {
-+    fputs("#include \"tcg/tcg.h\"\n", output_file);
+-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(0));
-     fputs("#include \"tcg/tcg-op.h\"\n", output_file);
++    TCGv_i32 z = tcg_constant_i32(0);
-+    fputs("#include \"exec/helper-gen.h\"\n", output_file);
++    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, z);
-     fputs("#include \"insn.h\"\n", output_file);
+ }
-     fputs("#include \"opcodes.h\"\n", output_file);
-     fputs("#include \"translate.h\"\n", output_file);
+ static void gen_op_fmaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
-     fputs("#define QEMU_GENERATE\n", output_file);
+ {
-     fputs("#include \"genptr.h\"\n", output_file);
+-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(0));
--    fputs("#include \"tcg/tcg.h\"\n", output_file);
++    TCGv_i32 z = tcg_constant_i32(0);
-     fputs("#include \"macros.h\"\n", output_file);
++    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, z);
-     fprintf(output_file, "#include \"%s\"\n", argv[ARG_INDEX_EMITTER_H]);
+ }
  static void gen_op_fmsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
  {
 -    int op = float_muladd_negate_c;
 -    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
 +    TCGv_i32 z = tcg_constant_i32(0);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
 +    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
  }
  static void gen_op_fmsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
  {
 -    int op = float_muladd_negate_c;
 -    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
 +    TCGv_i32 z = tcg_constant_i32(0);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
 +    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
  }
  static void gen_op_fnmsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
  {
 -    int op = float_muladd_negate_c | float_muladd_negate_result;
 -    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
 +    TCGv_i32 z = tcg_constant_i32(0);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c |
 +                                   float_muladd_negate_result);
 +    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
  }
  static void gen_op_fnmsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
  {
 -    int op = float_muladd_negate_c | float_muladd_negate_result;
 -    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
 +    TCGv_i32 z = tcg_constant_i32(0);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c |
 +                                   float_muladd_negate_result);
 +    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
  }
  static void gen_op_fnmadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
  {
 -    int op = float_muladd_negate_result;
 -    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
 +    TCGv_i32 z = tcg_constant_i32(0);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
 +    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
  }
  static void gen_op_fnmaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
  {
 -    int op = float_muladd_negate_result;
 -    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
 +    TCGv_i32 z = tcg_constant_i32(0);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
 +    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
  }
  /* Use muladd to compute (1 * src1) + src2 / 2 with one rounding. */
  static void gen_op_fhadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
  {
 -    TCGv_i32 one = tcg_constant_i32(float32_one);
 -    int op = float_muladd_halve_result;
 -    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
 +    TCGv_i32 fone = tcg_constant_i32(float32_one);
 +    TCGv_i32 mone = tcg_constant_i32(-1);
 +    TCGv_i32 op = tcg_constant_i32(0);
 +    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
  }
  static void gen_op_fhaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
  {
 -    TCGv_i64 one = tcg_constant_i64(float64_one);
 -    int op = float_muladd_halve_result;
 -    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
 +    TCGv_i64 fone = tcg_constant_i64(float64_one);
 +    TCGv_i32 mone = tcg_constant_i32(-1);
 +    TCGv_i32 op = tcg_constant_i32(0);
 +    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
  }
  /* Use muladd to compute (1 * src1) - src2 / 2 with one rounding. */
  static void gen_op_fhsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
  {
 -    TCGv_i32 one = tcg_constant_i32(float32_one);
 -    int op = float_muladd_negate_c | float_muladd_halve_result;
 -    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
 +    TCGv_i32 fone = tcg_constant_i32(float32_one);
 +    TCGv_i32 mone = tcg_constant_i32(-1);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
 +    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
  }
  static void gen_op_fhsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
  {
 -    TCGv_i64 one = tcg_constant_i64(float64_one);
 -    int op = float_muladd_negate_c | float_muladd_halve_result;
 -    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
 +    TCGv_i64 fone = tcg_constant_i64(float64_one);
 +    TCGv_i32 mone = tcg_constant_i32(-1);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
 +    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
  }
  /* Use muladd to compute -((1 * src1) + src2 / 2) with one rounding. */
  static void gen_op_fnhadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
  {
 -    TCGv_i32 one = tcg_constant_i32(float32_one);
 -    int op = float_muladd_negate_result | float_muladd_halve_result;
 -    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
 +    TCGv_i32 fone = tcg_constant_i32(float32_one);
 +    TCGv_i32 mone = tcg_constant_i32(-1);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
 +    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
  }
  static void gen_op_fnhaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
  {
 -    TCGv_i64 one = tcg_constant_i64(float64_one);
 -    int op = float_muladd_negate_result | float_muladd_halve_result;
 -    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
 +    TCGv_i64 fone = tcg_constant_i64(float64_one);
 +    TCGv_i32 mone = tcg_constant_i32(-1);
 +    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
 +    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
  }
  static void gen_op_fpexception_im(DisasContext *dc, int ftt)
 --
-.34.1
+.43.0

-[PULL 16/52] target/arm: Include helper-gen.h in translator.h
+[PULL 58/72] softfloat: Remove float_muladd_halve_result
-This had been included via tcg-op-common.h via tcg-op.h,
+All uses have been converted to float*_muladd_scalbn.
 but that is going away.
 It is needed for inlines within translator.h, so we might as well
 do it there and not individually in each translator c file.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/arm/tcg/translate.h     | 1 +
+ include/fpu/softfloat.h   | 3 ---
- target/arm/tcg/translate-a64.c | 2 --
+ fpu/softfloat.c           | 6 ------
- target/arm/tcg/translate-sme.c | 1 -
+ fpu/softfloat-parts.c.inc | 4 ----
- target/arm/tcg/translate-sve.c | 2 --
+files changed, 13 deletions(-)
  target/arm/tcg/translate.c     | 2 --
 files changed, 1 insertion(+), 7 deletions(-)
-diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
+diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/tcg/translate.h
+--- a/include/fpu/softfloat.h
-+++ b/target/arm/tcg/translate.h
++++ b/include/fpu/softfloat.h
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status);
- #define TARGET_ARM_TRANSLATE_H
+ | Using these differs from negating an input or output before calling
+ | the muladd function in that this means that a NaN doesn't have its
- #include "exec/translator.h"
+ | sign bit inverted before it is propagated.
-+#include "exec/helper-gen.h"
+-| We also support halving the result before rounding, as a special
- #include "internals.h"
+-| case to support the ARM fused-sqrt-step instruction FRSQRTS.
+ *----------------------------------------------------------------------------*/
+ enum {
-diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
+     float_muladd_negate_c = 1,
      float_muladd_negate_product = 2,
      float_muladd_negate_result = 4,
 -    float_muladd_halve_result = 8,
  };
  /*----------------------------------------------------------------------------
 diff --git a/fpu/softfloat.c b/fpu/softfloat.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/tcg/translate-a64.c
+--- a/fpu/softfloat.c
-+++ b/target/arm/tcg/translate-a64.c
++++ b/fpu/softfloat.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
- #include "qemu/host-utils.h"
+     if (unlikely(!can_use_fpu(s))) {
- #include "semihosting/semihost.h"
+         goto soft;
- #include "exec/gen-icount.h"
+     }
--#include "exec/helper-proto.h"
+-    if (unlikely(flags & float_muladd_halve_result)) {
--#include "exec/helper-gen.h"
+-        goto soft;
- #include "exec/log.h"
+-    }
- #include "cpregs.h"
- #include "translate-a64.h"
+     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
-diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
+     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
@@ -XXX,XX +XXX,XX @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
      if (unlikely(!can_use_fpu(s))) {
          goto soft;
      }
 -    if (unlikely(flags & float_muladd_halve_result)) {
 -        goto soft;
 -    }
      float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
      if (unlikely(!f64_is_zon3(ua, ub, uc))) {
 diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/tcg/translate-sme.c
+--- a/fpu/softfloat-parts.c.inc
-+++ b/target/arm/tcg/translate-sme.c
++++ b/fpu/softfloat-parts.c.inc
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
- #include "tcg/tcg-op-gvec.h"
+     a->exp = p_widen.exp;
- #include "tcg/tcg-gvec-desc.h"
- #include "translate.h"
+  return_normal:
--#include "exec/helper-gen.h"
+-    /* TODO: Replace all use of float_muladd_halve_result with scale. */
- #include "translate-a64.h"
+-    if (flags & float_muladd_halve_result) {
- #include "fpu/softfloat.h"
+-        a->exp -= 1;
+-    }
-diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
+     a->exp += scale;
-index XXXXXXX..XXXXXXX 100644
+  finish_sign:
---- a/target/arm/tcg/translate-sve.c
+     if (flags & float_muladd_negate_result) {
 +++ b/target/arm/tcg/translate-sve.c
@@ -XXX,XX +XXX,XX @@
  #include "arm_ldst.h"
  #include "translate.h"
  #include "internals.h"
 -#include "exec/helper-proto.h"
 -#include "exec/helper-gen.h"
  #include "exec/log.h"
  #include "translate-a64.h"
  #include "fpu/softfloat.h"
 diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate.c
 +++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/bitops.h"
  #include "arm_ldst.h"
  #include "semihosting/semihost.h"
 -#include "exec/helper-proto.h"
 -#include "exec/helper-gen.h"
  #include "exec/log.h"
  #include "cpregs.h"
 --
-.34.1
+.43.0

-[PULL 38/52] accel/tcg: Move translator_fake_ldb out of line
+[PULL 59/72] softfloat: Add float_round_nearest_even_max
-This is used by exactly one host in extraordinary circumstances.
+This rounding mode is used by Hexagon.
 This means that translator.h need not include plugin-gen.h;
 translator.c already includes plugin-gen.h.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/translator.h | 8 +-------
+ include/fpu/softfloat-types.h | 2 ++
- accel/tcg/translator.c    | 5 +++++
+ fpu/softfloat-parts.c.inc     | 3 +++
-files changed, 6 insertions(+), 7 deletions(-)
+files changed, 5 insertions(+)
-diff --git a/include/exec/translator.h b/include/exec/translator.h
+diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/translator.h
+--- a/include/fpu/softfloat-types.h
-+++ b/include/exec/translator.h
++++ b/include/fpu/softfloat-types.h
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ typedef enum __attribute__((__packed__)) {
- #include "qemu/bswap.h"
+     float_round_to_odd       = 5,
- #include "exec/exec-all.h"
+     /* Not an IEEE rounding mode: round to closest odd, overflow to inf */
- #include "exec/cpu_ldst.h"
+     float_round_to_odd_inf   = 6,
--#include "exec/plugin-gen.h"
++    /* Not an IEEE rounding mode: round to nearest even, overflow to max */
- #include "exec/translate-all.h"
++    float_round_nearest_even_max = 7,
- #include "tcg/tcg.h"
+ } FloatRoundMode;
@@ -XXX,XX +XXX,XX @@ translator_ldq_swap(CPUArchState *env, DisasContextBase *db,
   * re-synthesised for s390x "ex"). It ensures we update other areas of
   * the translator with details of the executed instruction.
   */
 -
 -static inline void translator_fake_ldb(uint8_t insn8, abi_ptr pc)
 -{
 -    plugin_insn_append(pc, &insn8, sizeof(insn8));
 -}
 -
 +void translator_fake_ldb(uint8_t insn8, abi_ptr pc);
  /*
-  * Return whether addr is on the same page as where disassembly started.
+diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
 diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/translator.c
+--- a/fpu/softfloat-parts.c.inc
-+++ b/accel/tcg/translator.c
++++ b/fpu/softfloat-parts.c.inc
-@@ -XXX,XX +XXX,XX @@ uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+@@ -XXX,XX +XXX,XX @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
-     plugin_insn_append(pc, &plug, sizeof(ret));
+     int exp, flags = 0;
-     return ret;
- }
+     switch (s->float_rounding_mode) {
-+
++    case float_round_nearest_even_max:
-+void translator_fake_ldb(uint8_t insn8, abi_ptr pc)
++        overflow_norm = true;
-+{
++        /* fall through */
-+    plugin_insn_append(pc, &insn8, sizeof(insn8));
+     case float_round_nearest_even:
-+}
+         if (N > 64 && frac_lsb == 0) {
              inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
 --
-.34.1
+.43.0

-[PULL 18/52] tcg: Remove outdated comments in helper-head.h
+[PULL 60/72] softfloat: Add float_muladd_suppress_add_product_zero
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Certain Hexagon instructions suppress changes to the result
 when the product of fma() is a true zero.
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/helper-head.h | 18 +++---------------
+ include/fpu/softfloat.h   | 5 +++++
-file changed, 3 insertions(+), 15 deletions(-)
+ fpu/softfloat.c           | 3 +++
  fpu/softfloat-parts.c.inc | 4 +++-
 files changed, 11 insertions(+), 1 deletion(-)
-diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
+diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/helper-head.h
+--- a/include/fpu/softfloat.h
-+++ b/include/exec/helper-head.h
++++ b/include/fpu/softfloat.h
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status);
--/* Helper file for declaring TCG helper functions.
+ | Using these differs from negating an input or output before calling
--   Used by other helper files.
+ | the muladd function in that this means that a NaN doesn't have its
--
+ | sign bit inverted before it is propagated.
--   Targets should use DEF_HELPER_N and DEF_HELPER_FLAGS_N to declare helper
++|
--   functions.  Names should be specified without the helper_ prefix, and
++| With float_muladd_suppress_add_product_zero, if A or B is zero
--   the return and argument types specified.  3 basic types are understood
++| such that the product is a true zero, then return C without addition.
--   (i32, i64 and ptr).  Additional aliases are provided for convenience and
++| This preserves the sign of C when C is +/- 0.  Used for Hexagon.
--   to match the types used by the C helper implementation.
+ *----------------------------------------------------------------------------*/
--
+ enum {
--   The target helper.h should be included in all files that use/define
+     float_muladd_negate_c = 1,
--   helper functions.  THis will ensure that function prototypes are
+     float_muladd_negate_product = 2,
--   consistent.  In addition it should be included an extra two times for
+     float_muladd_negate_result = 4,
--   helper.c, defining:
++    float_muladd_suppress_add_product_zero = 8,
--    GEN_HELPER 1 to produce op generation functions (gen_helper_*)
+ };
--    GEN_HELPER 2 to do runtime registration helper functions.
-+/*
+ /*----------------------------------------------------------------------------
-+ * Helper file for declaring TCG helper functions.
+diff --git a/fpu/softfloat.c b/fpu/softfloat.c
-+ * Used by other helper files.
+index XXXXXXX..XXXXXXX 100644
-  */
+--- a/fpu/softfloat.c
++++ b/fpu/softfloat.c
- #ifndef EXEC_HELPER_HEAD_H
+@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
      if (unlikely(!can_use_fpu(s))) {
          goto soft;
      }
 +    if (unlikely(flags & float_muladd_suppress_add_product_zero)) {
 +        goto soft;
 +    }
      float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
      if (unlikely(!f32_is_zon3(ua, ub, uc))) {
 diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/fpu/softfloat-parts.c.inc
 +++ b/fpu/softfloat-parts.c.inc
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
              goto return_normal;
          }
          if (c->cls == float_class_zero) {
 -            if (a->sign != c->sign) {
 +            if (flags & float_muladd_suppress_add_product_zero) {
 +                a->sign = c->sign;
 +            } else if (a->sign != c->sign) {
                  goto return_sub_zero;
              }
              goto return_zero;
 --
-.34.1
+.43.0

-[PULL 14/52] tcg: Move TCGv, dup_const_tl definitions to tcg-op.h
+[PULL 61/72] target/hexagon: Use float32_mul in helper_sfmpy
-These two items are the last uses of TARGET_LONG_BITS within tcg.h,
+There are no special cases for this instruction.
-and are more in common with the other "_tl" definitions within that file.
+Remove internal_mpyf as unused.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-op.h        | 15 ++++++++++++++-
+ target/hexagon/fma_emu.h   | 1 -
- include/tcg/tcg.h           | 19 -------------------
+ target/hexagon/fma_emu.c   | 8 --------
- target/mips/tcg/translate.h |  1 +
+ target/hexagon/op_helper.c | 2 +-
-files changed, 15 insertions(+), 20 deletions(-)
+files changed, 1 insertion(+), 10 deletions(-)
-diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
+diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op.h
+--- a/target/hexagon/fma_emu.h
-+++ b/include/tcg/tcg-op.h
++++ b/target/hexagon/fma_emu.h
-@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_plugin_cb_end(void)
+@@ -XXX,XX +XXX,XX @@ int32_t float32_getexp(float32 f32);
  float32 infinite_float32(uint8_t sign);
  float32 internal_fmafx(float32 a, float32 b, float32 c,
                         int scale, float_status *fp_status);
 -float32 internal_mpyf(float32 a, float32 b, float_status *fp_status);
  float64 internal_mpyhh(float64 a, float64 b,
                         unsigned long long int accumulated,
                         float_status *fp_status);
 diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hexagon/fma_emu.c
 +++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ float32 internal_fmafx(float32 a, float32 b, float32 c, int scale,
      return accum_round_float32(result, fp_status);
  }
- #if TARGET_LONG_BITS == 32
+-float32 internal_mpyf(float32 a, float32 b, float_status *fp_status)
-+typedef TCGv_i32 TCGv;
+-{
- #define tcg_temp_new() tcg_temp_new_i32()
+-    if (float32_is_zero(a) || float32_is_zero(b)) {
- #define tcg_global_mem_new tcg_global_mem_new_i32
+-        return float32_mul(a, b, fp_status);
- #define tcg_temp_free tcg_temp_free_i32
+-    }
- #define tcgv_tl_temp tcgv_i32_temp
+-    return internal_fmafx(a, b, float32_zero, 0, fp_status);
- #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32
+-}
- #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32
+-
--#else
+ float64 internal_mpyhh(float64 a, float64 b,
-+#elif TARGET_LONG_BITS == 64
+                       unsigned long long int accumulated,
-+typedef TCGv_i64 TCGv;
+                       float_status *fp_status)
- #define tcg_temp_new() tcg_temp_new_i64()
+diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
  #define tcg_global_mem_new tcg_global_mem_new_i64
  #define tcg_temp_free tcg_temp_free_i64
  #define tcgv_tl_temp tcgv_i64_temp
  #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64
  #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64
 +#else
 +#error Unhandled TARGET_LONG_BITS value
  #endif
  void tcg_gen_qemu_ld_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
  #define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i64
  #define tcg_gen_dup_tl_vec  tcg_gen_dup_i64_vec
  #define tcg_gen_dup_tl tcg_gen_dup_i64
 +#define dup_const_tl dup_const
  #else
  #define tcg_gen_movi_tl tcg_gen_movi_i32
  #define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -XXX,XX +XXX,XX @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
  #define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i32
  #define tcg_gen_dup_tl_vec  tcg_gen_dup_i32_vec
  #define tcg_gen_dup_tl tcg_gen_dup_i32
 +
 +#define dup_const_tl(VECE, C)                                      \
 +    (__builtin_constant_p(VECE)                                    \
 +     ? (  (VECE) == MO_8  ? 0x01010101ul * (uint8_t)(C)            \
 +        : (VECE) == MO_16 ? 0x00010001ul * (uint16_t)(C)           \
 +        : (VECE) == MO_32 ? 0x00000001ul * (uint32_t)(C)           \
 +        : (qemu_build_not_reached_always(), 0))                    \
 +     :  (target_long)dup_const(VECE, C))
  #endif
  #if UINTPTR_MAX == UINT32_MAX
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/target/hexagon/op_helper.c
-+++ b/include/tcg/tcg.h
++++ b/target/hexagon/op_helper.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TCGv_i128_d *TCGv_i128;
+@@ -XXX,XX +XXX,XX @@ float32 HELPER(sfmpy)(CPUHexagonState *env, float32 RsV, float32 RtV)
- typedef struct TCGv_ptr_d *TCGv_ptr;
+ {
- typedef struct TCGv_vec_d *TCGv_vec;
+     float32 RdV;
- typedef TCGv_ptr TCGv_env;
+     arch_fpop_start(env);
--#if TARGET_LONG_BITS == 32
+-    RdV = internal_mpyf(RsV, RtV, &env->fp_status);
--#define TCGv TCGv_i32
++    RdV = float32_mul(RsV, RtV, &env->fp_status);
--#elif TARGET_LONG_BITS == 64
+     arch_fpop_end(env);
--#define TCGv TCGv_i64
+     return RdV;
--#else
+ }
 -#error Unhandled TARGET_LONG_BITS value
 -#endif
  /* call flags */
  /* Helper does not read globals (either directly or through an exception). It
@@ -XXX,XX +XXX,XX @@ uint64_t dup_const(unsigned vece, uint64_t c);
          : (qemu_build_not_reached_always(), 0))                    \
       : dup_const(VECE, C))
 -#if TARGET_LONG_BITS == 64
 -# define dup_const_tl  dup_const
 -#else
 -# define dup_const_tl(VECE, C)                                     \
 -    (__builtin_constant_p(VECE)                                    \
 -     ? (  (VECE) == MO_8  ? 0x01010101ul * (uint8_t)(C)            \
 -        : (VECE) == MO_16 ? 0x00010001ul * (uint16_t)(C)           \
 -        : (VECE) == MO_32 ? 0x00000001ul * (uint32_t)(C)           \
 -        : (qemu_build_not_reached_always(), 0))                    \
 -     :  (target_long)dup_const(VECE, C))
 -#endif
 -
  #ifdef CONFIG_DEBUG_TCG
  void tcg_assert_listed_vecop(TCGOpcode);
  #else
 diff --git a/target/mips/tcg/translate.h b/target/mips/tcg/translate.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/tcg/translate.h
 +++ b/target/mips/tcg/translate.h
@@ -XXX,XX +XXX,XX @@
  #include "qemu/log.h"
  #include "exec/translator.h"
 +#include "tcg/tcg-op.h"
  #define MIPS_DEBUG_DISAS 0
 --
-.34.1
+.43.0

-[PULL 13/52] tcg: Split out tcg/oversized-guest.h
+[PULL 62/72] target/hexagon: Use float32_muladd for helper_sffma
-Move a use of TARGET_LONG_BITS out of tcg/tcg.h.
+There are no special cases for this instruction.
 Include the new file only where required.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu_ldst.h       |  3 +--
+ target/hexagon/op_helper.c | 2 +-
- include/tcg/oversized-guest.h | 23 +++++++++++++++++++++++
+file changed, 1 insertion(+), 1 deletion(-)
  include/tcg/tcg.h             |  9 ---------
  accel/tcg/cputlb.c            |  1 +
  accel/tcg/tcg-all.c           |  1 +
  target/arm/ptw.c              |  1 +
  target/riscv/cpu_helper.c     |  1 +
 files changed, 28 insertions(+), 11 deletions(-)
  create mode 100644 include/tcg/oversized-guest.h
-diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
+diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu_ldst.h
+--- a/target/hexagon/op_helper.c
-+++ b/include/exec/cpu_ldst.h
++++ b/target/hexagon/op_helper.c
-@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
+@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffma)(CPUHexagonState *env, float32 RxV,
+                       float32 RsV, float32 RtV)
  #else
 -/* Needed for TCG_OVERSIZED_GUEST */
 -#include "tcg/tcg.h"
 +#include "tcg/oversized-guest.h"
  static inline target_ulong tlb_read_idx(const CPUTLBEntry *entry,
                                          MMUAccessType access_type)
 diff --git a/include/tcg/oversized-guest.h b/include/tcg/oversized-guest.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/tcg/oversized-guest.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define TCG_OVERSIZED_GUEST
 + * Copyright (c) 2008 Fabrice Bellard
 + */
 +
 +#ifndef EXEC_TCG_OVERSIZED_GUEST_H
 +#define EXEC_TCG_OVERSIZED_GUEST_H
 +
 +#include "tcg-target-reg-bits.h"
 +#include "cpu-param.h"
 +
 +/*
 + * Oversized TCG guests make things like MTTCG hard
 + * as we can't use atomics for cputlb updates.
 + */
 +#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
 +#define TCG_OVERSIZED_GUEST 1
 +#else
 +#define TCG_OVERSIZED_GUEST 0
 +#endif
 +
 +#endif
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg.h
 +++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef uint64_t tcg_target_ulong;
  #error unsupported
  #endif
 -/* Oversized TCG guests make things like MTTCG hard
 - * as we can't use atomics for cputlb updates.
 - */
 -#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
 -#define TCG_OVERSIZED_GUEST 1
 -#else
 -#define TCG_OVERSIZED_GUEST 0
 -#endif
 -
  #if TCG_TARGET_NB_REGS <= 32
  typedef uint32_t TCGRegSet;
  #elif TCG_TARGET_NB_REGS <= 64
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/plugin-memory.h"
  #endif
  #include "tcg/tcg-ldst.h"
 +#include "tcg/oversized-guest.h"
  #include "exec/helper-proto.h"
  /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
 diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-all.c
 +++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/replay-core.h"
  #include "sysemu/cpu-timers.h"
  #include "tcg/tcg.h"
 +#include "tcg/oversized-guest.h"
  #include "qapi/error.h"
  #include "qemu/error-report.h"
  #include "qemu/accel.h"
 diff --git a/target/arm/ptw.c b/target/arm/ptw.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/ptw.c
 +++ b/target/arm/ptw.c
@@ -XXX,XX +XXX,XX @@
  #include "cpu.h"
  #include "internals.h"
  #include "idau.h"
 +#include "tcg/oversized-guest.h"
  typedef struct S1Translate {
 diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/cpu_helper.c
 +++ b/target/riscv/cpu_helper.c
@@ -XXX,XX +XXX,XX @@
  #include "sysemu/cpu-timers.h"
  #include "cpu_bits.h"
  #include "debug.h"
 +#include "tcg/oversized-guest.h"
  int riscv_cpu_mmu_index(CPURISCVState *env, bool ifetch)
  {
+     arch_fpop_start(env);
+-    RxV = internal_fmafx(RsV, RtV, RxV, 0, &env->fp_status);
++    RxV = float32_muladd(RsV, RtV, RxV, 0, &env->fp_status);
+     arch_fpop_end(env);
+     return RxV;
+ }
 --
-.34.1
+.43.0

-[PULL 11/52] tcg: Split out tcg-target-reg-bits.h
+[PULL 63/72] target/hexagon: Use float32_muladd for helper_sffms
-Often, the only thing we need to know about the TCG host
+There are no special cases for this instruction.  Since hexagon
-is the register size.
+always uses default-nan mode, explicitly negating the first
 input is unnecessary.  Use float_muladd_negate_product instead.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h                     | 12 +-----------
+ target/hexagon/op_helper.c | 5 ++---
- tcg/aarch64/tcg-target-reg-bits.h     | 12 ++++++++++++
+file changed, 2 insertions(+), 3 deletions(-)
  tcg/arm/tcg-target-reg-bits.h         | 12 ++++++++++++
  tcg/i386/tcg-target-reg-bits.h        | 16 ++++++++++++++++
  tcg/i386/tcg-target.h                 |  2 --
  tcg/loongarch64/tcg-target-reg-bits.h | 21 +++++++++++++++++++++
  tcg/loongarch64/tcg-target.h          | 11 -----------
  tcg/mips/tcg-target-reg-bits.h        | 18 ++++++++++++++++++
  tcg/mips/tcg-target.h                 |  8 --------
  tcg/ppc/tcg-target-reg-bits.h         | 16 ++++++++++++++++
  tcg/ppc/tcg-target.h                  |  5 -----
  tcg/riscv/tcg-target-reg-bits.h       | 19 +++++++++++++++++++
  tcg/riscv/tcg-target.h                |  9 ---------
  tcg/s390x/tcg-target-reg-bits.h       | 17 +++++++++++++++++
  tcg/sparc64/tcg-target-reg-bits.h     | 12 ++++++++++++
  tcg/tci/tcg-target-reg-bits.h         | 18 ++++++++++++++++++
  tcg/tci/tcg-target.h                  |  8 --------
  tcg/s390x/tcg-target.c.inc            |  5 -----
 files changed, 162 insertions(+), 59 deletions(-)
  create mode 100644 tcg/aarch64/tcg-target-reg-bits.h
  create mode 100644 tcg/arm/tcg-target-reg-bits.h
  create mode 100644 tcg/i386/tcg-target-reg-bits.h
  create mode 100644 tcg/loongarch64/tcg-target-reg-bits.h
  create mode 100644 tcg/mips/tcg-target-reg-bits.h
  create mode 100644 tcg/ppc/tcg-target-reg-bits.h
  create mode 100644 tcg/riscv/tcg-target-reg-bits.h
  create mode 100644 tcg/s390x/tcg-target-reg-bits.h
  create mode 100644 tcg/sparc64/tcg-target-reg-bits.h
  create mode 100644 tcg/tci/tcg-target-reg-bits.h
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/target/hexagon/op_helper.c
-+++ b/include/tcg/tcg.h
++++ b/target/hexagon/op_helper.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV,
- #include "qemu/plugin.h"
+ float32 HELPER(sffms)(CPUHexagonState *env, float32 RxV,
- #include "qemu/queue.h"
+                       float32 RsV, float32 RtV)
- #include "tcg/tcg-mo.h"
+ {
-+#include "tcg-target-reg-bits.h"
+-    float32 neg_RsV;
- #include "tcg-target.h"
+     arch_fpop_start(env);
- #include "tcg/tcg-cond.h"
+-    neg_RsV = float32_set_sign(RsV, float32_is_neg(RsV) ? 0 : 1);
- #include "tcg/debug-assert.h"
+-    RxV = internal_fmafx(neg_RsV, RtV, RxV, 0, &env->fp_status);
-@@ -XXX,XX +XXX,XX @@
++    RxV = float32_muladd(RsV, RtV, RxV, float_muladd_negate_product,
- #define CPU_TEMP_BUF_NLONGS 128
++                         &env->fp_status);
- #define TCG_STATIC_FRAME_SIZE  (CPU_TEMP_BUF_NLONGS * sizeof(long))
+     arch_fpop_end(env);
+     return RxV;
--/* Default target word size to pointer size.  */
+ }
 -#ifndef TCG_TARGET_REG_BITS
 -# if UINTPTR_MAX == UINT32_MAX
 -#  define TCG_TARGET_REG_BITS 32
 -# elif UINTPTR_MAX == UINT64_MAX
 -#  define TCG_TARGET_REG_BITS 64
 -# else
 -#  error Unknown pointer size for tcg target
 -# endif
 -#endif
 -
  #if TCG_TARGET_REG_BITS == 32
  typedef int32_t tcg_target_long;
  typedef uint32_t tcg_target_ulong;
 diff --git a/tcg/aarch64/tcg-target-reg-bits.h b/tcg/aarch64/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/aarch64/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2023 Linaro
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +#define TCG_TARGET_REG_BITS  64
 +
 +#endif
 diff --git a/tcg/arm/tcg-target-reg-bits.h b/tcg/arm/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/arm/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2023 Linaro
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +#define TCG_TARGET_REG_BITS  32
 +
 +#endif
 diff --git a/tcg/i386/tcg-target-reg-bits.h b/tcg/i386/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/i386/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2008 Fabrice Bellard
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +#ifdef __x86_64__
 +# define TCG_TARGET_REG_BITS  64
 +#else
 +# define TCG_TARGET_REG_BITS  32
 +#endif
 +
 +#endif
 diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.h
 +++ b/tcg/i386/tcg-target.h
@@ -XXX,XX +XXX,XX @@
  #define TCG_TARGET_INSN_UNIT_SIZE  1
  #ifdef __x86_64__
 -# define TCG_TARGET_REG_BITS  64
  # define TCG_TARGET_NB_REGS   32
  # define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
  #else
 -# define TCG_TARGET_REG_BITS  32
  # define TCG_TARGET_NB_REGS   24
  # define MAX_CODE_GEN_BUFFER_SIZE  UINT32_MAX
  #endif
 diff --git a/tcg/loongarch64/tcg-target-reg-bits.h b/tcg/loongarch64/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/loongarch64/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2021 WANG Xuerui <git@xen0n.name>
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +/*
 + * Loongson removed the (incomplete) 32-bit support from kernel and toolchain
 + * for the initial upstreaming of this architecture, so don't bother and just
 + * support the LP64* ABI for now.
 + */
 +#if defined(__loongarch64)
 +# define TCG_TARGET_REG_BITS 64
 +#else
 +# error unsupported LoongArch register size
 +#endif
 +
 +#endif
 diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/loongarch64/tcg-target.h
 +++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@
  #ifndef LOONGARCH_TCG_TARGET_H
  #define LOONGARCH_TCG_TARGET_H
 -/*
 - * Loongson removed the (incomplete) 32-bit support from kernel and toolchain
 - * for the initial upstreaming of this architecture, so don't bother and just
 - * support the LP64* ABI for now.
 - */
 -#if defined(__loongarch64)
 -# define TCG_TARGET_REG_BITS 64
 -#else
 -# error unsupported LoongArch register size
 -#endif
 -
  #define TCG_TARGET_INSN_UNIT_SIZE 4
  #define TCG_TARGET_NB_REGS 32
 diff --git a/tcg/mips/tcg-target-reg-bits.h b/tcg/mips/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/mips/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2008-2009 Arnaud Patard <arnaud.patard@rtp-net.org>
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +#if _MIPS_SIM == _ABIO32
 +# define TCG_TARGET_REG_BITS 32
 +#elif _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64
 +# define TCG_TARGET_REG_BITS 64
 +#else
 +# error "Unknown ABI"
 +#endif
 +
 +#endif
 diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.h
 +++ b/tcg/mips/tcg-target.h
@@ -XXX,XX +XXX,XX @@
  #ifndef MIPS_TCG_TARGET_H
  #define MIPS_TCG_TARGET_H
 -#if _MIPS_SIM == _ABIO32
 -# define TCG_TARGET_REG_BITS 32
 -#elif _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64
 -# define TCG_TARGET_REG_BITS 64
 -#else
 -# error "Unknown ABI"
 -#endif
 -
  #define TCG_TARGET_INSN_UNIT_SIZE 4
  #define TCG_TARGET_NB_REGS 32
 diff --git a/tcg/ppc/tcg-target-reg-bits.h b/tcg/ppc/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/ppc/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2008 Fabrice Bellard
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +#ifdef _ARCH_PPC64
 +# define TCG_TARGET_REG_BITS  64
 +#else
 +# define TCG_TARGET_REG_BITS  32
 +#endif
 +
 +#endif
 diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/ppc/tcg-target.h
 +++ b/tcg/ppc/tcg-target.h
@@ -XXX,XX +XXX,XX @@
  #ifndef PPC_TCG_TARGET_H
  #define PPC_TCG_TARGET_H
 -#ifdef _ARCH_PPC64
 -# define TCG_TARGET_REG_BITS  64
 -#else
 -# define TCG_TARGET_REG_BITS  32
 -#endif
  #define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
  #define TCG_TARGET_NB_REGS 64
 diff --git a/tcg/riscv/tcg-target-reg-bits.h b/tcg/riscv/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/riscv/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2018 SiFive, Inc
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +/*
 + * We don't support oversize guests.
 + * Since we will only build tcg once, this in turn requires a 64-bit host.
 + */
 +#if __riscv_xlen != 64
 +#error "unsupported code generation mode"
 +#endif
 +#define TCG_TARGET_REG_BITS 64
 +
 +#endif
 diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/riscv/tcg-target.h
 +++ b/tcg/riscv/tcg-target.h
@@ -XXX,XX +XXX,XX @@
  #ifndef RISCV_TCG_TARGET_H
  #define RISCV_TCG_TARGET_H
 -/*
 - * We don't support oversize guests.
 - * Since we will only build tcg once, this in turn requires a 64-bit host.
 - */
 -#if __riscv_xlen != 64
 -#error "unsupported code generation mode"
 -#endif
 -#define TCG_TARGET_REG_BITS 64
 -
  #define TCG_TARGET_INSN_UNIT_SIZE 4
  #define TCG_TARGET_NB_REGS 32
  #define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 diff --git a/tcg/s390x/tcg-target-reg-bits.h b/tcg/s390x/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/s390x/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2009 Ulrich Hecht <uli@suse.de>
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +/* We only support generating code for 64-bit mode.  */
 +#if UINTPTR_MAX == UINT64_MAX
 +# define TCG_TARGET_REG_BITS 64
 +#else
 +# error "unsupported code generation mode"
 +#endif
 +
 +#endif
 diff --git a/tcg/sparc64/tcg-target-reg-bits.h b/tcg/sparc64/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/sparc64/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2023 Linaro
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +#define TCG_TARGET_REG_BITS  64
 +
 +#endif
 diff --git a/tcg/tci/tcg-target-reg-bits.h b/tcg/tci/tcg-target-reg-bits.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/tci/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: MIT */
 +/*
 + * Define target-specific register size
 + * Copyright (c) 2009, 2011 Stefan Weil
 + */
 +
 +#ifndef TCG_TARGET_REG_BITS_H
 +#define TCG_TARGET_REG_BITS_H
 +
 +#if UINTPTR_MAX == UINT32_MAX
 +# define TCG_TARGET_REG_BITS 32
 +#elif UINTPTR_MAX == UINT64_MAX
 +# define TCG_TARGET_REG_BITS 64
 +#else
 +# error Unknown pointer size for tci target
 +#endif
 +
 +#endif
 diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.h
 +++ b/tcg/tci/tcg-target.h
@@ -XXX,XX +XXX,XX @@
  #define TCG_TARGET_INSN_UNIT_SIZE 4
  #define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 -#if UINTPTR_MAX == UINT32_MAX
 -# define TCG_TARGET_REG_BITS 32
 -#elif UINTPTR_MAX == UINT64_MAX
 -# define TCG_TARGET_REG_BITS 64
 -#else
 -# error Unknown pointer size for tci target
 -#endif
 -
  /* Optional instructions. */
  #define TCG_TARGET_HAS_bswap16_i32      1
 diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/s390x/tcg-target.c.inc
 +++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
   * THE SOFTWARE.
   */
 -/* We only support generating code for 64-bit mode.  */
 -#if TCG_TARGET_REG_BITS != 64
 -#error "unsupported code generation mode"
 -#endif
 -
  #include "../tcg-ldst.c.inc"
  #include "../tcg-pool.c.inc"
  #include "elf.h"
 --
-.34.1
+.43.0

-[PULL 10/52] *: Add missing includes of tcg/tcg.h
+[PULL 64/72] target/hexagon: Use float32_muladd_scalbn for helper_sffma_sc
-This had been pulled in from exec/cpu_ldst.h, via exec/exec-all.h,
+This instruction has a special case that 0 * x + c returns c
-but the include of tcg.h will be removed.
+without the normal sign folding that comes with 0 + -0.
 Use the new float_muladd_suppress_add_product_zero to
 describe this.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/monitor.c             | 1 +
+ target/hexagon/op_helper.c | 11 +++--------
- accel/tcg/tcg-accel-ops-mttcg.c | 2 +-
+file changed, 3 insertions(+), 8 deletions(-)
  accel/tcg/tcg-accel-ops-rr.c    | 2 +-
  target/i386/helper.c            | 3 +++
  target/openrisc/sys_helper.c    | 1 +
 files changed, 7 insertions(+), 2 deletions(-)
-diff --git a/accel/tcg/monitor.c b/accel/tcg/monitor.c
+diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/monitor.c
+--- a/target/hexagon/op_helper.c
-+++ b/accel/tcg/monitor.c
++++ b/target/hexagon/op_helper.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static float32 check_nan(float32 dst, float32 x, float_status *fp_status)
- #include "sysemu/cpus.h"
+ float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV,
- #include "sysemu/cpu-timers.h"
+                          float32 RsV, float32 RtV, float32 PuV)
  #include "sysemu/tcg.h"
 +#include "tcg/tcg.h"
  #include "internal.h"
 diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-accel-ops-mttcg.c
 +++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/guest-random.h"
  #include "exec/exec-all.h"
  #include "hw/boards.h"
 -
 +#include "tcg/tcg.h"
  #include "tcg-accel-ops.h"
  #include "tcg-accel-ops-mttcg.h"
 diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-accel-ops-rr.c
 +++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -XXX,XX +XXX,XX @@
  #include "qemu/notify.h"
  #include "qemu/guest-random.h"
  #include "exec/exec-all.h"
 -
 +#include "tcg/tcg.h"
  #include "tcg-accel-ops.h"
  #include "tcg-accel-ops-rr.h"
  #include "tcg-accel-ops-icount.h"
 diff --git a/target/i386/helper.c b/target/i386/helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/helper.c
 +++ b/target/i386/helper.c
@@ -XXX,XX +XXX,XX @@
  #include "monitor/monitor.h"
  #endif
  #include "qemu/log.h"
 +#ifdef CONFIG_TCG
 +#include "tcg/tcg.h"
 +#endif
  void cpu_sync_avx_hflag(CPUX86State *env)
  {
-diff --git a/target/openrisc/sys_helper.c b/target/openrisc/sys_helper.c
+-    size4s_t tmp;
-index XXXXXXX..XXXXXXX 100644
+     arch_fpop_start(env);
---- a/target/openrisc/sys_helper.c
+-    RxV = check_nan(RxV, RxV, &env->fp_status);
-+++ b/target/openrisc/sys_helper.c
+-    RxV = check_nan(RxV, RsV, &env->fp_status);
-@@ -XXX,XX +XXX,XX @@
+-    RxV = check_nan(RxV, RtV, &env->fp_status);
- #ifndef CONFIG_USER_ONLY
+-    tmp = internal_fmafx(RsV, RtV, RxV, fSXTN(8, 64, PuV), &env->fp_status);
- #include "hw/boards.h"
+-    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
- #endif
+-        RxV = tmp;
-+#include "tcg/tcg.h"
+-    }
++    RxV = float32_muladd_scalbn(RsV, RtV, RxV, fSXTN(8, 64, PuV),
- #define TO_SPR(group, number) (((group) << 11) + (number))
++                                float_muladd_suppress_add_product_zero,
++                                &env->fp_status);
      arch_fpop_end(env);
      return RxV;
  }
 --
-.34.1
+.43.0

-[PULL 37/52] target/ppc: Inline gen_icount_io_start()
+[PULL 65/72] target/hexagon: Use float32_muladd for helper_sffm[as]_lib
-From: Philippe Mathieu-Daudé <philmd@linaro.org>
+There are multiple special cases for this instruction.
 (1) The saturate to normal maximum instead of overflow to infinity is
     handled by the new float_round_nearest_even_max rounding mode.
 (2) The 0 * n + c special case is handled by the new
     float_muladd_suppress_add_product_zero flag.
 (3) The Inf - Inf -> 0 special case can be detected after the fact
     by examining float_flag_invalid_isi.
-Now that gen_icount_io_start() is a simple wrapper to
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 translator_io_start(), inline it.
 Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Message-Id: <20230602095439.48102-1-philmd@linaro.org>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/ppc/translate.c                 | 63 ++++++++++++--------------
+ target/hexagon/op_helper.c | 105 +++++++++----------------------------
- target/ppc/power8-pmu-regs.c.inc       | 10 ++--
+file changed, 26 insertions(+), 79 deletions(-)
  target/ppc/translate/branch-impl.c.inc |  2 +-
 files changed, 35 insertions(+), 40 deletions(-)
-diff --git a/target/ppc/translate.c b/target/ppc/translate.c
+diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/ppc/translate.c
+--- a/target/hexagon/op_helper.c
-+++ b/target/ppc/translate.c
++++ b/target/hexagon/op_helper.c
-@@ -XXX,XX +XXX,XX @@ static void gen_exception_nip(DisasContext *ctx, uint32_t excp,
+@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffma)(CPUHexagonState *env, float32 RxV,
-     ctx->base.is_jmp = DISAS_NORETURN;
+     return RxV;
  }
--static void gen_icount_io_start(DisasContext *ctx)
+-static bool is_zero_prod(float32 a, float32 b)
 -{
--    translator_io_start(&ctx->base);
+-    return ((float32_is_zero(a) && is_finite(b)) ||
 -            (float32_is_zero(b) && is_finite(a)));
 -}
 -
- #if !defined(CONFIG_USER_ONLY)
+-static float32 check_nan(float32 dst, float32 x, float_status *fp_status)
- static void gen_ppc_maybe_interrupt(DisasContext *ctx)
+-{
 -    float32 ret = dst;
 -    if (float32_is_any_nan(x)) {
 -        if (extract32(x, 22, 1) == 0) {
 -            float_raise(float_flag_invalid, fp_status);
 -        }
 -        ret = make_float32(0xffffffff);    /* nan */
 -    }
 -    return ret;
 -}
 -
  float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV,
                           float32 RsV, float32 RtV, float32 PuV)
  {
--    gen_icount_io_start(ctx);
+@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffms)(CPUHexagonState *env, float32 RxV,
-+    translator_io_start(&ctx->base);
+     return RxV;
      gen_helper_ppc_maybe_interrupt(cpu_env);
  }
- #endif
-@@ -XXX,XX +XXX,XX @@ void spr_write_ureg(DisasContext *ctx, int sprn, int gprn)
+-static bool is_inf_prod(int32_t a, int32_t b)
- #if !defined(CONFIG_USER_ONLY)
++static float32 do_sffma_lib(CPUHexagonState *env, float32 RxV,
- void spr_read_decr(DisasContext *ctx, int gprn, int sprn)
++                            float32 RsV, float32 RtV, int negate)
  {
--    gen_icount_io_start(ctx);
+-    return (float32_is_infinity(a) && float32_is_infinity(b)) ||
-+    translator_io_start(&ctx->base);
+-           (float32_is_infinity(a) && is_finite(b) && !float32_is_zero(b)) ||
-     gen_helper_load_decr(cpu_gpr[gprn], cpu_env);
+-           (float32_is_infinity(b) && is_finite(a) && !float32_is_zero(a));
 +    int flags;
 +
 +    arch_fpop_start(env);
 +
 +    set_float_rounding_mode(float_round_nearest_even_max, &env->fp_status);
 +    RxV = float32_muladd(RsV, RtV, RxV,
 +                         negate | float_muladd_suppress_add_product_zero,
 +                         &env->fp_status);
 +
 +    flags = get_float_exception_flags(&env->fp_status);
 +    if (flags) {
 +        /* Flags are suppressed by this instruction. */
 +        set_float_exception_flags(0, &env->fp_status);
 +
 +        /* Return 0 for Inf - Inf. */
 +        if (flags & float_flag_invalid_isi) {
 +            RxV = 0;
 +        }
 +    }
 +
 +    arch_fpop_end(env);
 +    return RxV;
  }
- void spr_write_decr(DisasContext *ctx, int sprn, int gprn)
+ float32 HELPER(sffma_lib)(CPUHexagonState *env, float32 RxV,
                            float32 RsV, float32 RtV)
  {
--    gen_icount_io_start(ctx);
+-    bool infinp;
-+    translator_io_start(&ctx->base);
+-    bool infminusinf;
-     gen_helper_store_decr(cpu_env, cpu_gpr[gprn]);
+-    float32 tmp;
 -
 -    arch_fpop_start(env);
 -    set_float_rounding_mode(float_round_nearest_even, &env->fp_status);
 -    infminusinf = float32_is_infinity(RxV) &&
 -                  is_inf_prod(RsV, RtV) &&
 -                  (fGETBIT(31, RsV ^ RxV ^ RtV) != 0);
 -    infinp = float32_is_infinity(RxV) ||
 -             float32_is_infinity(RtV) ||
 -             float32_is_infinity(RsV);
 -    RxV = check_nan(RxV, RxV, &env->fp_status);
 -    RxV = check_nan(RxV, RsV, &env->fp_status);
 -    RxV = check_nan(RxV, RtV, &env->fp_status);
 -    tmp = internal_fmafx(RsV, RtV, RxV, 0, &env->fp_status);
 -    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
 -        RxV = tmp;
 -    }
 -    set_float_exception_flags(0, &env->fp_status);
 -    if (float32_is_infinity(RxV) && !infinp) {
 -        RxV = RxV - 1;
 -    }
 -    if (infminusinf) {
 -        RxV = 0;
 -    }
 -    arch_fpop_end(env);
 -    return RxV;
 +    return do_sffma_lib(env, RxV, RsV, RtV, 0);
  }
- #endif
-@@ -XXX,XX +XXX,XX @@ void spr_write_decr(DisasContext *ctx, int sprn, int gprn)
+ float32 HELPER(sffms_lib)(CPUHexagonState *env, float32 RxV,
- /* Time base */
+                           float32 RsV, float32 RtV)
  void spr_read_tbl(DisasContext *ctx, int gprn, int sprn)
  {
--    gen_icount_io_start(ctx);
+-    bool infinp;
-+    translator_io_start(&ctx->base);
+-    bool infminusinf;
-     gen_helper_load_tbl(cpu_gpr[gprn], cpu_env);
+-    float32 tmp;
 -
 -    arch_fpop_start(env);
 -    set_float_rounding_mode(float_round_nearest_even, &env->fp_status);
 -    infminusinf = float32_is_infinity(RxV) &&
 -                  is_inf_prod(RsV, RtV) &&
 -                  (fGETBIT(31, RsV ^ RxV ^ RtV) == 0);
 -    infinp = float32_is_infinity(RxV) ||
 -             float32_is_infinity(RtV) ||
 -             float32_is_infinity(RsV);
 -    RxV = check_nan(RxV, RxV, &env->fp_status);
 -    RxV = check_nan(RxV, RsV, &env->fp_status);
 -    RxV = check_nan(RxV, RtV, &env->fp_status);
 -    float32 minus_RsV = float32_sub(float32_zero, RsV, &env->fp_status);
 -    tmp = internal_fmafx(minus_RsV, RtV, RxV, 0, &env->fp_status);
 -    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
 -        RxV = tmp;
 -    }
 -    set_float_exception_flags(0, &env->fp_status);
 -    if (float32_is_infinity(RxV) && !infinp) {
 -        RxV = RxV - 1;
 -    }
 -    if (infminusinf) {
 -        RxV = 0;
 -    }
 -    arch_fpop_end(env);
 -    return RxV;
 +    return do_sffma_lib(env, RxV, RsV, RtV, float_muladd_negate_product);
  }
- void spr_read_tbu(DisasContext *ctx, int gprn, int sprn)
+ float64 HELPER(dfmpyfix)(CPUHexagonState *env, float64 RssV, float64 RttV)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_load_tbu(cpu_gpr[gprn], cpu_env);
  }
@@ -XXX,XX +XXX,XX @@ void spr_read_atbu(DisasContext *ctx, int gprn, int sprn)
  #if !defined(CONFIG_USER_ONLY)
  void spr_write_tbl(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_tbl(cpu_env, cpu_gpr[gprn]);
  }
  void spr_write_tbu(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_tbu(cpu_env, cpu_gpr[gprn]);
  }
@@ -XXX,XX +XXX,XX @@ void spr_write_atbu(DisasContext *ctx, int sprn, int gprn)
  #if defined(TARGET_PPC64)
  void spr_read_purr(DisasContext *ctx, int gprn, int sprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_load_purr(cpu_gpr[gprn], cpu_env);
  }
  void spr_write_purr(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_purr(cpu_env, cpu_gpr[gprn]);
  }
  /* HDECR */
  void spr_read_hdecr(DisasContext *ctx, int gprn, int sprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_load_hdecr(cpu_gpr[gprn], cpu_env);
  }
  void spr_write_hdecr(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_hdecr(cpu_env, cpu_gpr[gprn]);
  }
  void spr_read_vtb(DisasContext *ctx, int gprn, int sprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_load_vtb(cpu_gpr[gprn], cpu_env);
  }
  void spr_write_vtb(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_vtb(cpu_env, cpu_gpr[gprn]);
  }
  void spr_write_tbu40(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_tbu40(cpu_env, cpu_gpr[gprn]);
  }
@@ -XXX,XX +XXX,XX @@ void spr_write_dpdes(DisasContext *ctx, int sprn, int gprn)
  #if !defined(CONFIG_USER_ONLY)
  void spr_read_40x_pit(DisasContext *ctx, int gprn, int sprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_load_40x_pit(cpu_gpr[gprn], cpu_env);
  }
  void spr_write_40x_pit(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_40x_pit(cpu_env, cpu_gpr[gprn]);
  }
  void spr_write_40x_dbcr0(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_store_spr(sprn, cpu_gpr[gprn]);
      gen_helper_store_40x_dbcr0(cpu_env, cpu_gpr[gprn]);
      /* We must stop translation as we may have rebooted */
@@ -XXX,XX +XXX,XX @@ void spr_write_40x_dbcr0(DisasContext *ctx, int sprn, int gprn)
  void spr_write_40x_sler(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_40x_sler(cpu_env, cpu_gpr[gprn]);
  }
  void spr_write_40x_tcr(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_40x_tcr(cpu_env, cpu_gpr[gprn]);
  }
  void spr_write_40x_tsr(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_40x_tsr(cpu_env, cpu_gpr[gprn]);
  }
@@ -XXX,XX +XXX,XX @@ void spr_write_40x_pid(DisasContext *ctx, int sprn, int gprn)
  void spr_write_booke_tcr(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_booke_tcr(cpu_env, cpu_gpr[gprn]);
  }
  void spr_write_booke_tsr(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_booke_tsr(cpu_env, cpu_gpr[gprn]);
  }
  #endif
@@ -XXX,XX +XXX,XX @@ static void gen_darn(DisasContext *ctx)
      if (l > 2) {
          tcg_gen_movi_i64(cpu_gpr[rD(ctx->opcode)], -1);
      } else {
 -        gen_icount_io_start(ctx);
 +        translator_io_start(&ctx->base);
          if (l == 0) {
              gen_helper_darn32(cpu_gpr[rD(ctx->opcode)]);
          } else {
@@ -XXX,XX +XXX,XX @@ static void pmu_count_insns(DisasContext *ctx)
       * running with icount and we do not handle it beforehand,
       * the helper can trigger a 'bad icount read'.
       */
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      /* Avoid helper calls when only PMC5-6 are enabled. */
      if (!ctx->pmc_other) {
@@ -XXX,XX +XXX,XX @@ static void gen_rfi(DisasContext *ctx)
      }
      /* Restore CPU state */
      CHK_SV(ctx);
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_update_cfar(ctx, ctx->cia);
      gen_helper_rfi(cpu_env);
      ctx->base.is_jmp = DISAS_EXIT;
@@ -XXX,XX +XXX,XX @@ static void gen_rfid(DisasContext *ctx)
  #else
      /* Restore CPU state */
      CHK_SV(ctx);
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_update_cfar(ctx, ctx->cia);
      gen_helper_rfid(cpu_env);
      ctx->base.is_jmp = DISAS_EXIT;
@@ -XXX,XX +XXX,XX @@ static void gen_rfscv(DisasContext *ctx)
  #else
      /* Restore CPU state */
      CHK_SV(ctx);
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_update_cfar(ctx, ctx->cia);
      gen_helper_rfscv(cpu_env);
      ctx->base.is_jmp = DISAS_EXIT;
@@ -XXX,XX +XXX,XX @@ static void gen_mtmsrd(DisasContext *ctx)
      t0 = tcg_temp_new();
      t1 = tcg_temp_new();
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      if (ctx->opcode & 0x00010000) {
          /* L=1 form only updates EE and RI */
@@ -XXX,XX +XXX,XX @@ static void gen_mtmsr(DisasContext *ctx)
      t0 = tcg_temp_new();
      t1 = tcg_temp_new();
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      if (ctx->opcode & 0x00010000) {
          /* L=1 form only updates EE and RI */
          mask &= (1ULL << MSR_RI) | (1ULL << MSR_EE);
 diff --git a/target/ppc/power8-pmu-regs.c.inc b/target/ppc/power8-pmu-regs.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/power8-pmu-regs.c.inc
 +++ b/target/ppc/power8-pmu-regs.c.inc
@@ -XXX,XX +XXX,XX @@ static void write_MMCR0_common(DisasContext *ctx, TCGv val)
      /*
       * helper_store_mmcr0 will make clock based operations that
       * will cause 'bad icount read' errors if we do not execute
 -     * gen_icount_io_start() beforehand.
 +     * translator_io_start() beforehand.
       */
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_mmcr0(cpu_env, val);
      /*
@@ -XXX,XX +XXX,XX @@ void spr_read_PMC(DisasContext *ctx, int gprn, int sprn)
  {
      TCGv_i32 t_sprn = tcg_constant_i32(sprn);
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_read_pmc(cpu_gpr[gprn], cpu_env, t_sprn);
  }
@@ -XXX,XX +XXX,XX @@ void spr_write_PMC(DisasContext *ctx, int sprn, int gprn)
  {
      TCGv_i32 t_sprn = tcg_constant_i32(sprn);
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_pmc(cpu_env, t_sprn, cpu_gpr[gprn]);
  }
@@ -XXX,XX +XXX,XX @@ void spr_write_MMCR0(DisasContext *ctx, int sprn, int gprn)
  void spr_write_MMCR1(DisasContext *ctx, int sprn, int gprn)
  {
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_helper_store_mmcr1(cpu_env, cpu_gpr[gprn]);
  }
  #else
 diff --git a/target/ppc/translate/branch-impl.c.inc b/target/ppc/translate/branch-impl.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/translate/branch-impl.c.inc
 +++ b/target/ppc/translate/branch-impl.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_RFEBB(DisasContext *ctx, arg_XL_s *arg)
  {
      REQUIRE_INSNS_FLAGS2(ctx, ISA207S);
 -    gen_icount_io_start(ctx);
 +    translator_io_start(&ctx->base);
      gen_update_cfar(ctx, ctx->cia);
      gen_helper_rfebb(cpu_env, cpu_gpr[arg->s]);
 --
-.34.1
+.43.0

-[PULL 09/52] target/*: Add missing includes of tcg/debug-assert.h
+[PULL 66/72] target/hexagon: Remove internal_fmafx
-This had been pulled in from tcg/tcg.h, via exec/cpu_ldst.h,
+The function is now unused.
 via exec/exec-all.h, but the include of tcg.h will be removed.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/avr/cpu.c      | 1 +
+ target/hexagon/fma_emu.h |   2 -
- target/rx/cpu.c       | 1 +
+ target/hexagon/fma_emu.c | 171 ---------------------------------------
- target/rx/op_helper.c | 1 +
+files changed, 173 deletions(-)
  target/tricore/cpu.c  | 1 +
 files changed, 4 insertions(+)
-diff --git a/target/avr/cpu.c b/target/avr/cpu.c
+diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h
 index XXXXXXX..XXXXXXX 100644
---- a/target/avr/cpu.c
+--- a/target/hexagon/fma_emu.h
-+++ b/target/avr/cpu.c
++++ b/target/hexagon/fma_emu.h
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static inline uint32_t float32_getexp_raw(float32 f32)
- #include "exec/exec-all.h"
+ }
- #include "cpu.h"
+ int32_t float32_getexp(float32 f32);
- #include "disas/dis-asm.h"
+ float32 infinite_float32(uint8_t sign);
-+#include "tcg/debug-assert.h"
+-float32 internal_fmafx(float32 a, float32 b, float32 c,
+-                       int scale, float_status *fp_status);
- static void avr_cpu_set_pc(CPUState *cs, vaddr value)
+ float64 internal_mpyhh(float64 a, float64 b,
                         unsigned long long int accumulated,
                         float_status *fp_status);
 diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hexagon/fma_emu.c
 +++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ int32_t float64_getexp(float64 f64)
      return -1;
  }
 -static uint64_t float32_getmant(float32 f32)
 -{
 -    Float a = { .i = f32 };
 -    if (float32_is_normal(f32)) {
 -        return a.mant | 1ULL << 23;
 -    }
 -    if (float32_is_zero(f32)) {
 -        return 0;
 -    }
 -    if (float32_is_denormal(f32)) {
 -        return a.mant;
 -    }
 -    return ~0ULL;
 -}
 -
  int32_t float32_getexp(float32 f32)
  {
-diff --git a/target/rx/cpu.c b/target/rx/cpu.c
+     Float a = { .i = f32 };
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
---- a/target/rx/cpu.c
+ }
-+++ b/target/rx/cpu.c
-@@ -XXX,XX +XXX,XX @@
+ /* Return a maximum finite value with the requested sign */
- #include "exec/exec-all.h"
+-static float32 maxfinite_float32(uint8_t sign)
- #include "hw/loader.h"
+-{
- #include "fpu/softfloat.h"
+-    if (sign) {
-+#include "tcg/debug-assert.h"
+-        return make_float32(SF_MINUS_MAXF);
+-    } else {
- static void rx_cpu_set_pc(CPUState *cs, vaddr value)
+-        return make_float32(SF_MAXF);
- {
+-    }
-diff --git a/target/rx/op_helper.c b/target/rx/op_helper.c
+-}
-index XXXXXXX..XXXXXXX 100644
+-
---- a/target/rx/op_helper.c
+-/* Return a zero value with requested sign */
-+++ b/target/rx/op_helper.c
+-static float32 zero_float32(uint8_t sign)
-@@ -XXX,XX +XXX,XX @@
+-{
- #include "exec/helper-proto.h"
+-    if (sign) {
- #include "exec/cpu_ldst.h"
+-        return make_float32(0x80000000);
- #include "fpu/softfloat.h"
+-    } else {
-+#include "tcg/debug-assert.h"
+-        return float32_zero;
+-    }
- static inline G_NORETURN
+-}
- void raise_exception(CPURXState *env, int index,
+-
-diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
+ #define GEN_XF_ROUND(SUFFIX, MANTBITS, INF_EXP, INTERNAL_TYPE) \
-index XXXXXXX..XXXXXXX 100644
+ static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
---- a/target/tricore/cpu.c
+ { \
-+++ b/target/tricore/cpu.c
+@@ -XXX,XX +XXX,XX @@ static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
-@@ -XXX,XX +XXX,XX @@
+ }
- #include "cpu.h"
- #include "exec/exec-all.h"
+ GEN_XF_ROUND(float64, DF_MANTBITS, DF_INF_EXP, Double)
- #include "qemu/error-report.h"
+-GEN_XF_ROUND(float32, SF_MANTBITS, SF_INF_EXP, Float)
-+#include "tcg/debug-assert.h"
+-
+-static bool is_inf_prod(float64 a, float64 b)
- static inline void set_feature(CPUTriCoreState *env, int feature)
+-{
- {
+-    return ((float64_is_infinity(a) && float64_is_infinity(b)) ||
 -            (float64_is_infinity(a) && is_finite(b) && (!float64_is_zero(b))) ||
 -            (float64_is_infinity(b) && is_finite(a) && (!float64_is_zero(a))));
 -}
 -
 -static float64 special_fma(float64 a, float64 b, float64 c,
 -                           float_status *fp_status)
 -{
 -    float64 ret = make_float64(0);
 -
 -    /*
 -     * If A multiplied by B is an exact infinity and C is also an infinity
 -     * but with the opposite sign, FMA returns NaN and raises invalid.
 -     */
 -    uint8_t a_sign = float64_is_neg(a);
 -    uint8_t b_sign = float64_is_neg(b);
 -    uint8_t c_sign = float64_is_neg(c);
 -    if (is_inf_prod(a, b) && float64_is_infinity(c)) {
 -        if ((a_sign ^ b_sign) != c_sign) {
 -            ret = make_float64(DF_NAN);
 -            float_raise(float_flag_invalid, fp_status);
 -            return ret;
 -        }
 -    }
 -    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
 -        (float64_is_zero(a) && float64_is_infinity(b))) {
 -        ret = make_float64(DF_NAN);
 -        float_raise(float_flag_invalid, fp_status);
 -        return ret;
 -    }
 -    /*
 -     * If none of the above checks are true and C is a NaN,
 -     * a NaN shall be returned
 -     * If A or B are NaN, a NAN shall be returned.
 -     */
 -    if (float64_is_any_nan(a) ||
 -        float64_is_any_nan(b) ||
 -        float64_is_any_nan(c)) {
 -        if (float64_is_any_nan(a) && (fGETBIT(51, a) == 0)) {
 -            float_raise(float_flag_invalid, fp_status);
 -        }
 -        if (float64_is_any_nan(b) && (fGETBIT(51, b) == 0)) {
 -            float_raise(float_flag_invalid, fp_status);
 -        }
 -        if (float64_is_any_nan(c) && (fGETBIT(51, c) == 0)) {
 -            float_raise(float_flag_invalid, fp_status);
 -        }
 -        ret = make_float64(DF_NAN);
 -        return ret;
 -    }
 -    /*
 -     * We have checked for adding opposite-signed infinities.
 -     * Other infinities return infinity with the correct sign
 -     */
 -    if (float64_is_infinity(c)) {
 -        ret = infinite_float64(c_sign);
 -        return ret;
 -    }
 -    if (float64_is_infinity(a) || float64_is_infinity(b)) {
 -        ret = infinite_float64(a_sign ^ b_sign);
 -        return ret;
 -    }
 -    g_assert_not_reached();
 -}
 -
 -static float32 special_fmaf(float32 a, float32 b, float32 c,
 -                            float_status *fp_status)
 -{
 -    float64 aa, bb, cc;
 -    aa = float32_to_float64(a, fp_status);
 -    bb = float32_to_float64(b, fp_status);
 -    cc = float32_to_float64(c, fp_status);
 -    return float64_to_float32(special_fma(aa, bb, cc, fp_status), fp_status);
 -}
 -
 -float32 internal_fmafx(float32 a, float32 b, float32 c, int scale,
 -                       float_status *fp_status)
 -{
 -    Accum prod;
 -    Accum acc;
 -    Accum result;
 -    accum_init(&prod);
 -    accum_init(&acc);
 -    accum_init(&result);
 -
 -    uint8_t a_sign = float32_is_neg(a);
 -    uint8_t b_sign = float32_is_neg(b);
 -    uint8_t c_sign = float32_is_neg(c);
 -    if (float32_is_infinity(a) ||
 -        float32_is_infinity(b) ||
 -        float32_is_infinity(c)) {
 -        return special_fmaf(a, b, c, fp_status);
 -    }
 -    if (float32_is_any_nan(a) ||
 -        float32_is_any_nan(b) ||
 -        float32_is_any_nan(c)) {
 -        return special_fmaf(a, b, c, fp_status);
 -    }
 -    if ((scale == 0) && (float32_is_zero(a) || float32_is_zero(b))) {
 -        float32 tmp = float32_mul(a, b, fp_status);
 -        tmp = float32_add(tmp, c, fp_status);
 -        return tmp;
 -    }
 -
 -    /* (a * 2**b) * (c * 2**d) == a*c * 2**(b+d) */
 -    prod.mant = int128_mul_6464(float32_getmant(a), float32_getmant(b));
 -
 -    /*
 -     * Note: extracting the mantissa into an int is multiplying by
 -     * 2**23, so adjust here
 -     */
 -    prod.exp = float32_getexp(a) + float32_getexp(b) - SF_BIAS - 23;
 -    prod.sign = a_sign ^ b_sign;
 -    if (float32_is_zero(a) || float32_is_zero(b)) {
 -        prod.exp = -2 * WAY_BIG_EXP;
 -    }
 -    if ((scale > 0) && float32_is_denormal(c)) {
 -        acc.mant = int128_mul_6464(0, 0);
 -        acc.exp = -WAY_BIG_EXP;
 -        acc.sign = c_sign;
 -        acc.sticky = 1;
 -        result = accum_add(prod, acc);
 -    } else if (!float32_is_zero(c)) {
 -        acc.mant = int128_mul_6464(float32_getmant(c), 1);
 -        acc.exp = float32_getexp(c);
 -        acc.sign = c_sign;
 -        result = accum_add(prod, acc);
 -    } else {
 -        result = prod;
 -    }
 -    result.exp += scale;
 -    return accum_round_float32(result, fp_status);
 -}
  float64 internal_mpyhh(float64 a, float64 b,
                        unsigned long long int accumulated,
 --
-.34.1
+.43.0

-[PULL 12/52] target/arm: Fix test of TCG_OVERSIZED_GUEST
+[PULL 67/72] target/hexagon: Expand GEN_XF_ROUND
-The symbol is always defined, even if to 0.  We wanted to test for
+This massive macro is now only used once.
-TCG_OVERSIZED_GUEST == 0.
+Expand it for use only by float64.
-This fixed, the #error is reached while building arm-softmmu, because
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 TCG_OVERSIZED_GUEST is not true (nor supposed to be true) for arm32
 guest on a 32-bit host.  But that's ok, because this feature doesn't
 apply to arm32.  Add an #ifdef for TARGET_AARCH64.
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/arm/ptw.c | 7 ++++++-
+ target/hexagon/fma_emu.c | 255 +++++++++++++++++++--------------------
-file changed, 6 insertions(+), 1 deletion(-)
+file changed, 127 insertions(+), 128 deletions(-)
-diff --git a/target/arm/ptw.c b/target/arm/ptw.c
+diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/ptw.c
+--- a/target/hexagon/fma_emu.c
-+++ b/target/arm/ptw.c
++++ b/target/hexagon/fma_emu.c
-@@ -XXX,XX +XXX,XX @@ static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
+@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
                               uint64_t new_val, S1Translate *ptw,
                               ARMMMUFaultInfo *fi)
  {
 +#ifdef TARGET_AARCH64
      uint64_t cur_val;
      void *host = ptw->out_host;
@@ -XXX,XX +XXX,XX @@ static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
       * we know that TCG_OVERSIZED_GUEST is set, which means that we are
       * running in round-robin mode and could only race with dma i/o.
       */
 -#ifndef TCG_OVERSIZED_GUEST
 +#if !TCG_OVERSIZED_GUEST
  # error "Unexpected configuration"
  #endif
      bool locked = qemu_mutex_iothread_locked();
@@ -XXX,XX +XXX,XX @@ static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
  #endif
      return cur_val;
 +#else
 +    /* AArch32 does not have FEAT_HADFS. */
 +    g_assert_not_reached();
 +#endif
  }
- static bool get_level1_table_address(CPUARMState *env, ARMMMUIdx mmu_idx,
+ /* Return a maximum finite value with the requested sign */
 -#define GEN_XF_ROUND(SUFFIX, MANTBITS, INF_EXP, INTERNAL_TYPE) \
 -static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
 -{ \
 -    if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0) \
 -        && ((a.guard | a.round | a.sticky) == 0)) { \
 -        /* result zero */ \
 -        switch (fp_status->float_rounding_mode) { \
 -        case float_round_down: \
 -            return zero_##SUFFIX(1); \
 -        default: \
 -            return zero_##SUFFIX(0); \
 -        } \
 -    } \
 -    /* Normalize right */ \
 -    /* We want MANTBITS bits of mantissa plus the leading one. */ \
 -    /* That means that we want MANTBITS+1 bits, or 0x000000000000FF_FFFF */ \
 -    /* So we need to normalize right while the high word is non-zero and \
 -    * while the low word is nonzero when masked with 0xffe0_0000_0000_0000 */ \
 -    while ((int128_gethi(a.mant) != 0) || \
 -           ((int128_getlo(a.mant) >> (MANTBITS + 1)) != 0)) { \
 -        a = accum_norm_right(a, 1); \
 -    } \
 -    /* \
 -     * OK, now normalize left \
 -     * We want to normalize left until we have a leading one in bit 24 \
 -     * Theoretically, we only need to shift a maximum of one to the left if we \
 -     * shifted out lots of bits from B, or if we had no shift / 1 shift sticky \
 -     * should be 0  \
 -     */ \
 -    while ((int128_getlo(a.mant) & (1ULL << MANTBITS)) == 0) { \
 -        a = accum_norm_left(a); \
 -    } \
 -    /* \
 -     * OK, now we might need to denormalize because of potential underflow. \
 -     * We need to do this before rounding, and rounding might make us normal \
 -     * again \
 -     */ \
 -    while (a.exp <= 0) { \
 -        a = accum_norm_right(a, 1 - a.exp); \
 -        /* \
 -         * Do we have underflow? \
 -         * That's when we get an inexact answer because we ran out of bits \
 -         * in a denormal. \
 -         */ \
 -        if (a.guard || a.round || a.sticky) { \
 -            float_raise(float_flag_underflow, fp_status); \
 -        } \
 -    } \
 -    /* OK, we're relatively canonical... now we need to round */ \
 -    if (a.guard || a.round || a.sticky) { \
 -        float_raise(float_flag_inexact, fp_status); \
 -        switch (fp_status->float_rounding_mode) { \
 -        case float_round_to_zero: \
 -            /* Chop and we're done */ \
 -            break; \
 -        case float_round_up: \
 -            if (a.sign == 0) { \
 -                a.mant = int128_add(a.mant, int128_one()); \
 -            } \
 -            break; \
 -        case float_round_down: \
 -            if (a.sign != 0) { \
 -                a.mant = int128_add(a.mant, int128_one()); \
 -            } \
 -            break; \
 -        default: \
 -            if (a.round || a.sticky) { \
 -                /* round up if guard is 1, down if guard is zero */ \
 -                a.mant = int128_add(a.mant, int128_make64(a.guard)); \
 -            } else if (a.guard) { \
 -                /* exactly .5, round up if odd */ \
 -                a.mant = int128_add(a.mant, int128_and(a.mant, int128_one())); \
 -            } \
 -            break; \
 -        } \
 -    } \
 -    /* \
 -     * OK, now we might have carried all the way up. \
 -     * So we might need to shr once \
 -     * at least we know that the lsb should be zero if we rounded and \
 -     * got a carry out... \
 -     */ \
 -    if ((int128_getlo(a.mant) >> (MANTBITS + 1)) != 0) { \
 -        a = accum_norm_right(a, 1); \
 -    } \
 -    /* Overflow? */ \
 -    if (a.exp >= INF_EXP) { \
 -        /* Yep, inf result */ \
 -        float_raise(float_flag_overflow, fp_status); \
 -        float_raise(float_flag_inexact, fp_status); \
 -        switch (fp_status->float_rounding_mode) { \
 -        case float_round_to_zero: \
 -            return maxfinite_##SUFFIX(a.sign); \
 -        case float_round_up: \
 -            if (a.sign == 0) { \
 -                return infinite_##SUFFIX(a.sign); \
 -            } else { \
 -                return maxfinite_##SUFFIX(a.sign); \
 -            } \
 -        case float_round_down: \
 -            if (a.sign != 0) { \
 -                return infinite_##SUFFIX(a.sign); \
 -            } else { \
 -                return maxfinite_##SUFFIX(a.sign); \
 -            } \
 -        default: \
 -            return infinite_##SUFFIX(a.sign); \
 -        } \
 -    } \
 -    /* Underflow? */ \
 -    if (int128_getlo(a.mant) & (1ULL << MANTBITS)) { \
 -        /* Leading one means: No, we're normal. So, we should be done... */ \
 -        INTERNAL_TYPE ret; \
 -        ret.i = 0; \
 -        ret.sign = a.sign; \
 -        ret.exp = a.exp; \
 -        ret.mant = int128_getlo(a.mant); \
 -        return ret.i; \
 -    } \
 -    assert(a.exp == 1); \
 -    INTERNAL_TYPE ret; \
 -    ret.i = 0; \
 -    ret.sign = a.sign; \
 -    ret.exp = 0; \
 -    ret.mant = int128_getlo(a.mant); \
 -    return ret.i; \
 +static float64 accum_round_float64(Accum a, float_status *fp_status)
 +{
 +    if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0)
 +        && ((a.guard | a.round | a.sticky) == 0)) {
 +        /* result zero */
 +        switch (fp_status->float_rounding_mode) {
 +        case float_round_down:
 +            return zero_float64(1);
 +        default:
 +            return zero_float64(0);
 +        }
 +    }
 +    /*
 +     * Normalize right
 +     * We want DF_MANTBITS bits of mantissa plus the leading one.
 +     * That means that we want DF_MANTBITS+1 bits, or 0x001F_FFFF_FFFF_FFFF
 +     * So we need to normalize right while the high word is non-zero and
 +     * while the low word is nonzero when masked with 0xffe0_0000_0000_0000
 +     */
 +    while ((int128_gethi(a.mant) != 0) ||
 +           ((int128_getlo(a.mant) >> (DF_MANTBITS + 1)) != 0)) {
 +        a = accum_norm_right(a, 1);
 +    }
 +    /*
 +     * OK, now normalize left
 +     * We want to normalize left until we have a leading one in bit DF_MANTBITS
 +     * Theoretically, we only need to shift a maximum of one to the left if we
 +     * shifted out lots of bits from B, or if we had no shift / 1 shift sticky
 +     * should be 0
 +     */
 +    while ((int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) == 0) {
 +        a = accum_norm_left(a);
 +    }
 +    /*
 +     * OK, now we might need to denormalize because of potential underflow.
 +     * We need to do this before rounding, and rounding might make us normal
 +     * again
 +     */
 +    while (a.exp <= 0) {
 +        a = accum_norm_right(a, 1 - a.exp);
 +        /*
 +         * Do we have underflow?
 +         * That's when we get an inexact answer because we ran out of bits
 +         * in a denormal.
 +         */
 +        if (a.guard || a.round || a.sticky) {
 +            float_raise(float_flag_underflow, fp_status);
 +        }
 +    }
 +    /* OK, we're relatively canonical... now we need to round */
 +    if (a.guard || a.round || a.sticky) {
 +        float_raise(float_flag_inexact, fp_status);
 +        switch (fp_status->float_rounding_mode) {
 +        case float_round_to_zero:
 +            /* Chop and we're done */
 +            break;
 +        case float_round_up:
 +            if (a.sign == 0) {
 +                a.mant = int128_add(a.mant, int128_one());
 +            }
 +            break;
 +        case float_round_down:
 +            if (a.sign != 0) {
 +                a.mant = int128_add(a.mant, int128_one());
 +            }
 +            break;
 +        default:
 +            if (a.round || a.sticky) {
 +                /* round up if guard is 1, down if guard is zero */
 +                a.mant = int128_add(a.mant, int128_make64(a.guard));
 +            } else if (a.guard) {
 +                /* exactly .5, round up if odd */
 +                a.mant = int128_add(a.mant, int128_and(a.mant, int128_one()));
 +            }
 +            break;
 +        }
 +    }
 +    /*
 +     * OK, now we might have carried all the way up.
 +     * So we might need to shr once
 +     * at least we know that the lsb should be zero if we rounded and
 +     * got a carry out...
 +     */
 +    if ((int128_getlo(a.mant) >> (DF_MANTBITS + 1)) != 0) {
 +        a = accum_norm_right(a, 1);
 +    }
 +    /* Overflow? */
 +    if (a.exp >= DF_INF_EXP) {
 +        /* Yep, inf result */
 +        float_raise(float_flag_overflow, fp_status);
 +        float_raise(float_flag_inexact, fp_status);
 +        switch (fp_status->float_rounding_mode) {
 +        case float_round_to_zero:
 +            return maxfinite_float64(a.sign);
 +        case float_round_up:
 +            if (a.sign == 0) {
 +                return infinite_float64(a.sign);
 +            } else {
 +                return maxfinite_float64(a.sign);
 +            }
 +        case float_round_down:
 +            if (a.sign != 0) {
 +                return infinite_float64(a.sign);
 +            } else {
 +                return maxfinite_float64(a.sign);
 +            }
 +        default:
 +            return infinite_float64(a.sign);
 +        }
 +    }
 +    /* Underflow? */
 +    if (int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) {
 +        /* Leading one means: No, we're normal. So, we should be done... */
 +        Double ret;
 +        ret.i = 0;
 +        ret.sign = a.sign;
 +        ret.exp = a.exp;
 +        ret.mant = int128_getlo(a.mant);
 +        return ret.i;
 +    }
 +    assert(a.exp == 1);
 +    Double ret;
 +    ret.i = 0;
 +    ret.sign = a.sign;
 +    ret.exp = 0;
 +    ret.mant = int128_getlo(a.mant);
 +    return ret.i;
  }
 -GEN_XF_ROUND(float64, DF_MANTBITS, DF_INF_EXP, Double)
 -
  float64 internal_mpyhh(float64 a, float64 b,
                        unsigned long long int accumulated,
                        float_status *fp_status)
 --
-.34.1
+.43.0

-[PULL 05/52] tcg: Move TCG_TYPE_TL from tcg.h to tcg-op.h
+[PULL 68/72] target/hexagon: Remove Float
-Removes the only use of TARGET_LONG_BITS from tcg.h, which is to be
+This structure, with bitfields, is incorrect for big-endian.
-target independent.  Move the symbol to a define in tcg-op.h, which
+Use the existing float32_getexp_raw which uses extract32.
 will continue to be target dependent.  Rather than complicate matters
 for the use in tb_gen_code(), expand the definition there.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-op.h      | 8 ++++++++
+ target/hexagon/fma_emu.c | 16 +++-------------
- include/tcg/tcg.h         | 7 -------
+file changed, 3 insertions(+), 13 deletions(-)
  accel/tcg/translate-all.c | 2 +-
 files changed, 9 insertions(+), 8 deletions(-)
-diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
+diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op.h
+--- a/target/hexagon/fma_emu.c
-+++ b/include/tcg/tcg-op.h
++++ b/target/hexagon/fma_emu.c
-@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
+@@ -XXX,XX +XXX,XX @@ typedef union {
- #error must include QEMU headers
+     };
- #endif
+ } Double;
-+#if TARGET_LONG_BITS == 32
+-typedef union {
-+# define TCG_TYPE_TL  TCG_TYPE_I32
+-    float f;
-+#elif TARGET_LONG_BITS == 64
+-    uint32_t i;
-+# define TCG_TYPE_TL  TCG_TYPE_I64
+-    struct {
-+#else
+-        uint32_t mant:23;
-+# error
+-        uint32_t exp:8;
-+#endif
+-        uint32_t sign:1;
-+
+-    };
- #if TARGET_INSN_START_WORDS == 1
+-} Float;
- static inline void tcg_gen_insn_start(target_ulong pc)
+-
  static uint64_t float64_getmant(float64 f64)
  {
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+     Double a = { .i = f64 };
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ int32_t float64_getexp(float64 f64)
---- a/include/tcg/tcg.h
-+++ b/include/tcg/tcg.h
+ int32_t float32_getexp(float32 f32)
-@@ -XXX,XX +XXX,XX @@ typedef enum TCGType {
+ {
- #else
+-    Float a = { .i = f32 };
-     TCG_TYPE_PTR = TCG_TYPE_I64,
++    int exp = float32_getexp_raw(f32);
- #endif
+     if (float32_is_normal(f32)) {
--
+-        return a.exp;
--    /* An alias for the size of the target "long", aka register.  */
++        return exp;
--#if TARGET_LONG_BITS == 64
+     }
--    TCG_TYPE_TL = TCG_TYPE_I64,
+     if (float32_is_denormal(f32)) {
--#else
+-        return a.exp + 1;
--    TCG_TYPE_TL = TCG_TYPE_I32,
++        return exp + 1;
--#endif
+     }
- } TCGType;
+     return -1;
+ }
  /**
 diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
      tb_set_page_addr0(tb, phys_pc);
      tb_set_page_addr1(tb, -1);
      tcg_ctx->gen_tb = tb;
 -    tcg_ctx->addr_type = TCG_TYPE_TL;
 +    tcg_ctx->addr_type = TARGET_LONG_BITS == 32 ? TCG_TYPE_I32 : TCG_TYPE_I64;
  #ifdef CONFIG_SOFTMMU
      tcg_ctx->page_bits = TARGET_PAGE_BITS;
      tcg_ctx->page_mask = TARGET_PAGE_MASK;
 --
-.34.1
+.43.0

-[PULL 08/52] target/avr: Add missing includes of qemu/error-report.h
+[PULL 69/72] target/hexagon: Remove Double
-This had been pulled in from tcg/tcg.h, via exec/cpu_ldst.h,
+This structure, with bitfields, is incorrect for big-endian.
-via exec/exec-all.h, but the include of tcg.h will be removed.
+Use extract64 and deposit64 instead.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/avr/helper.c | 1 +
+ target/hexagon/fma_emu.c | 46 ++++++++++++++--------------------------
-file changed, 1 insertion(+)
+file changed, 16 insertions(+), 30 deletions(-)
-diff --git a/target/avr/helper.c b/target/avr/helper.c
+diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/avr/helper.c
+--- a/target/hexagon/fma_emu.c
-+++ b/target/avr/helper.c
++++ b/target/hexagon/fma_emu.c
 @@ -XXX,XX +XXX,XX @@
- #include "qemu/osdep.h"
+ #define WAY_BIG_EXP 4096
- #include "qemu/log.h"
-+#include "qemu/error-report.h"
+-typedef union {
- #include "cpu.h"
+-    double f;
- #include "hw/core/tcg-cpu-ops.h"
+-    uint64_t i;
- #include "exec/exec-all.h"
+-    struct {
 -        uint64_t mant:52;
 -        uint64_t exp:11;
 -        uint64_t sign:1;
 -    };
 -} Double;
 -
  static uint64_t float64_getmant(float64 f64)
  {
 -    Double a = { .i = f64 };
 +    uint64_t mant = extract64(f64, 0, 52);
      if (float64_is_normal(f64)) {
 -        return a.mant | 1ULL << 52;
 +        return mant | 1ULL << 52;
      }
      if (float64_is_zero(f64)) {
          return 0;
      }
      if (float64_is_denormal(f64)) {
 -        return a.mant;
 +        return mant;
      }
      return ~0ULL;
  }
  int32_t float64_getexp(float64 f64)
  {
 -    Double a = { .i = f64 };
 +    int exp = extract64(f64, 52, 11);
      if (float64_is_normal(f64)) {
 -        return a.exp;
 +        return exp;
      }
      if (float64_is_denormal(f64)) {
 -        return a.exp + 1;
 +        return exp + 1;
      }
      return -1;
  }
@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
  /* Return a maximum finite value with the requested sign */
  static float64 accum_round_float64(Accum a, float_status *fp_status)
  {
 +    uint64_t ret;
 +
      if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0)
          && ((a.guard | a.round | a.sticky) == 0)) {
          /* result zero */
@@ -XXX,XX +XXX,XX @@ static float64 accum_round_float64(Accum a, float_status *fp_status)
          }
      }
      /* Underflow? */
 -    if (int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) {
 +    ret = int128_getlo(a.mant);
 +    if (ret & (1ULL << DF_MANTBITS)) {
          /* Leading one means: No, we're normal. So, we should be done... */
 -        Double ret;
 -        ret.i = 0;
 -        ret.sign = a.sign;
 -        ret.exp = a.exp;
 -        ret.mant = int128_getlo(a.mant);
 -        return ret.i;
 +        ret = deposit64(ret, 52, 11, a.exp);
 +    } else {
 +        assert(a.exp == 1);
 +        ret = deposit64(ret, 52, 11, 0);
      }
 -    assert(a.exp == 1);
 -    Double ret;
 -    ret.i = 0;
 -    ret.sign = a.sign;
 -    ret.exp = 0;
 -    ret.mant = int128_getlo(a.mant);
 -    return ret.i;
 +    ret = deposit64(ret, 63, 1, a.sign);
 +    return ret;
  }
  float64 internal_mpyhh(float64 a, float64 b,
 --
-.34.1
+.43.0

-[PULL 21/52] tcg: Move temp_idx and tcgv_i32_temp debug out of line
+[PULL 70/72] target/hexagon: Use mulu64 for int128_mul_6464
-Removes a multiplicity of calls to __assert_fail, saving up
+No need to open-code 64x64->128-bit multiplication.
 to 360kiB of .text space as measured on an x86_64 host.
-Old     New     Less    %Change
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 9257272    8888680    368592    3.98%    qemu-system-aarch64
 6100968    5911832    189136    3.10%    qemu-system-riscv64
 5839112    5707032    132080    2.26%    qemu-system-mips
 4447608    4341752    105856    2.38%    qemu-system-s390x
 Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h | 30 ++++++++++++++++--------------
+ target/hexagon/fma_emu.c | 32 +++-----------------------------
- tcg/tcg.c         | 19 +++++++++++++++++++
+file changed, 3 insertions(+), 29 deletions(-)
 files changed, 35 insertions(+), 14 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/target/hexagon/fma_emu.c
-+++ b/include/tcg/tcg.h
++++ b/target/hexagon/fma_emu.c
-@@ -XXX,XX +XXX,XX @@ static inline void *tcg_splitwx_to_rw(const void *rx)
+@@ -XXX,XX +XXX,XX @@ int32_t float32_getexp(float32 f32)
      return -1;
  }
- #endif
+-static uint32_t int128_getw0(Int128 x)
 -static inline size_t temp_idx(TCGTemp *ts)
 -{
--    ptrdiff_t n = ts - tcg_ctx->temps;
+-    return int128_getlo(x);
 -    tcg_debug_assert(n >= 0 && n < tcg_ctx->nb_temps);
 -    return n;
 -}
 -
- static inline TCGArg temp_arg(TCGTemp *ts)
+-static uint32_t int128_getw1(Int128 x)
 -{
 -    return int128_getlo(x) >> 32;
 -}
 -
  static Int128 int128_mul_6464(uint64_t ai, uint64_t bi)
  {
-     return (uintptr_t)ts;
+-    Int128 a, b;
-@@ -XXX,XX +XXX,XX @@ static inline TCGTemp *arg_temp(TCGArg a)
+-    uint64_t pp0, pp1a, pp1b, pp1s, pp2;
-     return (TCGTemp *)(uintptr_t)a;
++    uint64_t l, h;
 -    a = int128_make64(ai);
 -    b = int128_make64(bi);
 -    pp0 = (uint64_t)int128_getw0(a) * (uint64_t)int128_getw0(b);
 -    pp1a = (uint64_t)int128_getw1(a) * (uint64_t)int128_getw0(b);
 -    pp1b = (uint64_t)int128_getw1(b) * (uint64_t)int128_getw0(a);
 -    pp2 = (uint64_t)int128_getw1(a) * (uint64_t)int128_getw1(b);
 -
 -    pp1s = pp1a + pp1b;
 -    if ((pp1s < pp1a) || (pp1s < pp1b)) {
 -        pp2 += (1ULL << 32);
 -    }
 -    uint64_t ret_low = pp0 + (pp1s << 32);
 -    if ((ret_low < pp0) || (ret_low < (pp1s << 32))) {
 -        pp2 += 1;
 -    }
 -
 -    return int128_make128(ret_low, pp2 + (pp1s >> 32));
 +    mulu64(&l, &h, ai, bi);
 +    return int128_make128(l, h);
  }
--/* Using the offset of a temporary, relative to TCGContext, rather than
+ static Int128 int128_sub_borrow(Int128 a, Int128 b, int borrow)
 -   its index means that we don't use 0.  That leaves offset 0 free for
 -   a NULL representation without having to leave index 0 unused.  */
 +#ifdef CONFIG_DEBUG_TCG
 +size_t temp_idx(TCGTemp *ts);
 +TCGTemp *tcgv_i32_temp(TCGv_i32 v);
 +#else
 +static inline size_t temp_idx(TCGTemp *ts)
 +{
 +    return ts - tcg_ctx->temps;
 +}
 +
 +/*
 + * Using the offset of a temporary, relative to TCGContext, rather than
 + * its index means that we don't use 0.  That leaves offset 0 free for
 + * a NULL representation without having to leave index 0 unused.
 + */
  static inline TCGTemp *tcgv_i32_temp(TCGv_i32 v)
  {
 -    uintptr_t o = (uintptr_t)v;
 -    TCGTemp *t = (void *)tcg_ctx + o;
 -    tcg_debug_assert(offsetof(TCGContext, temps[temp_idx(t)]) == o);
 -    return t;
 +    return (void *)tcg_ctx + (uintptr_t)v;
  }
 +#endif
  static inline TCGTemp *tcgv_i64_temp(TCGv_i64 v)
  {
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_constant_vec_matching(TCGv_vec match, unsigned vece, int64_t val)
      return tcg_constant_vec(t->base_type, vece, val);
  }
 +#ifdef CONFIG_DEBUG_TCG
 +size_t temp_idx(TCGTemp *ts)
 +{
 +    ptrdiff_t n = ts - tcg_ctx->temps;
 +    assert(n >= 0 && n < tcg_ctx->nb_temps);
 +    return n;
 +}
 +
 +TCGTemp *tcgv_i32_temp(TCGv_i32 v)
 +{
 +    uintptr_t o = (uintptr_t)v - offsetof(TCGContext, temps);
 +
 +    assert(o < sizeof(TCGTemp) * tcg_ctx->nb_temps);
 +    assert(o % sizeof(TCGTemp) == 0);
 +
 +    return (void *)tcg_ctx + (uintptr_t)v;
 +}
 +#endif /* CONFIG_DEBUG_TCG */
 +
  /* Return true if OP may appear in the opcode stream.
     Test the runtime variable that controls each opcode.  */
  bool tcg_op_supported(TCGOpcode op)
 --
-.34.1
+.43.0

-[PULL 04/52] tcg/sparc64: Remove TARGET_LONG_BITS, TCG_TYPE_TL
+[PULL 71/72] target/hexagon: Simplify internal_mpyhh setup
-All uses replaced with TCGContext.addr_type.
+Initialize x with accumulated via direct assignment,
 rather than multiplying by 1.
-Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
+Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/sparc64/tcg-target.c.inc | 7 ++++---
+ target/hexagon/fma_emu.c | 2 +-
-file changed, 4 insertions(+), 3 deletions(-)
+file changed, 1 insertion(+), 1 deletion(-)
-diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
+diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/sparc64/tcg-target.c.inc
+--- a/target/hexagon/fma_emu.c
-+++ b/tcg/sparc64/tcg-target.c.inc
++++ b/target/hexagon/fma_emu.c
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+@@ -XXX,XX +XXX,XX @@ float64 internal_mpyhh(float64 a, float64 b,
-                                            TCGReg addr_reg, MemOpIdx oi,
+         float64_is_infinity(b)) {
-                                            bool is_ld)
+         return float64_mul(a, b, fp_status);
- {
+     }
-+    TCGType addr_type = s->addr_type;
+-    x.mant = int128_mul_6464(accumulated, 1);
-     TCGLabelQemuLdst *ldst = NULL;
++    x.mant = int128_make64(accumulated);
-     MemOp opc = get_memop(oi);
+     x.sticky = sticky;
-     MemOp s_bits = opc & MO_SIZE;
+     prod = fGETUWORD(1, float64_getmant(a)) * fGETUWORD(1, float64_getmant(b));
-@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
+     x.mant = int128_add(x.mant, int128_mul_6464(prod, 0x100000000ULL));
      tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T3, ARITH_ADD);
      /* Load the tlb comparator and the addend. */
 -    tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_T2, TCG_REG_T1, cmp_off);
 +    tcg_out_ld(s, addr_type, TCG_REG_T2, TCG_REG_T1, cmp_off);
      tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T1, TCG_REG_T1, add_off);
      h->base = TCG_REG_T1;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      ldst->label_ptr[0] = s->code_ptr;
      /* bne,pn %[xi]cc, label0 */
 -    cc = TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC;
 +    cc = addr_type == TCG_TYPE_I32 ? BPCC_ICC : BPCC_XCC;
      tcg_out_bpcc0(s, COND_NE, BPCC_PN | cc, 0);
  #else
      /*
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
  #endif
      /* If the guest address must be zero-extended, do in the delay slot.  */
 -    if (TARGET_LONG_BITS == 32) {
 +    if (addr_type == TCG_TYPE_I32) {
          tcg_out_ext32u(s, TCG_REG_T2, addr_reg);
          h->index = TCG_REG_T2;
      } else {
 --
-.34.1
+.43.0

-[PULL 20/52] tcg: Pass TCGHelperInfo to tcg_gen_callN
+[PULL 72/72] accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core
-In preparation for compiling tcg/ only once, eliminate
+Convert all targets simultaneously, as the gen_intermediate_code
-the all_helpers array.  Instantiate the info structs for
+function disappears from the target.  While there are possible
-the generic helpers in accel/tcg/, and the structs for
+workarounds, they're larger than simply performing the conversion.
 the target-specific helpers in each translate.c.
-Since we don't see all of the info structs at startup,
+Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
 initialize at first use, using g_once_init_* to make
 sure we don't race while doing so.
 Reviewed-by: Anton Johansson <anjo@rev.ng>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- MAINTAINERS                    |   1 +
+ include/exec/translator.h        | 14 --------------
- include/exec/helper-gen.h      |  66 ++++++++++++--------
+ include/hw/core/tcg-cpu-ops.h    | 13 +++++++++++++
- include/exec/helper-tcg.h      |  75 -----------------------
+ target/alpha/cpu.h               |  2 ++
- include/qemu/typedefs.h        |   1 +
+ target/arm/internals.h           |  2 ++
- include/tcg/helper-info.h      |   9 ++-
+ target/avr/cpu.h                 |  2 ++
- include/tcg/tcg.h              |   2 +-
+ target/hexagon/cpu.h             |  2 ++
- accel/tcg/plugin-gen.c         |   5 ++
+ target/hppa/cpu.h                |  2 ++
- accel/tcg/tcg-runtime.c        |   4 ++
+ target/i386/tcg/helper-tcg.h     |  2 ++
- target/alpha/translate.c       |   3 +
+ target/loongarch/internals.h     |  2 ++
- target/arm/tcg/translate.c     |   3 +
+ target/m68k/cpu.h                |  2 ++
- target/avr/translate.c         |   5 ++
+ target/microblaze/cpu.h          |  2 ++
- target/cris/translate.c        |   6 +-
+ target/mips/tcg/tcg-internal.h   |  2 ++
- target/hexagon/translate.c     |   4 ++
+ target/openrisc/cpu.h            |  2 ++
- target/hppa/translate.c        |   5 ++
+ target/ppc/cpu.h                 |  2 ++
- target/i386/tcg/translate.c    |   5 ++
+ target/riscv/cpu.h               |  3 +++
- target/loongarch/translate.c   |   4 ++
+ target/rx/cpu.h                  |  2 ++
- target/m68k/translate.c        |   3 +
+ target/s390x/s390x-internal.h    |  2 ++
- target/microblaze/translate.c  |   4 ++
+ target/sh4/cpu.h                 |  2 ++
- target/mips/tcg/translate.c    |   5 ++
+ target/sparc/cpu.h               |  2 ++
- target/nios2/translate.c       |   5 ++
+ target/tricore/cpu.h             |  2 ++
- target/openrisc/translate.c    |   5 ++
+ target/xtensa/cpu.h              |  2 ++
- target/ppc/translate.c         |   4 ++
+ accel/tcg/cpu-exec.c             |  8 +++++---
- target/riscv/translate.c       |   4 ++
+ accel/tcg/translate-all.c        |  8 +++++---
- target/rx/translate.c          |   5 ++
+ target/alpha/cpu.c               |  1 +
- target/s390x/tcg/translate.c   |   4 ++
+ target/alpha/translate.c         |  4 ++--
- target/sh4/translate.c         |   4 ++
+ target/arm/cpu.c                 |  1 +
- target/sparc/translate.c       |   3 +
+ target/arm/tcg/cpu-v7m.c         |  1 +
- target/tricore/translate.c     |   5 ++
+ target/arm/tcg/translate.c       |  5 ++---
- target/xtensa/translate.c      |   4 ++
+ target/avr/cpu.c                 |  1 +
- tcg/tcg.c                      | 108 ++++++++++++---------------------
+ target/avr/translate.c           |  6 +++---
- include/exec/helper-info.c.inc |  96 +++++++++++++++++++++++++++++
+ target/hexagon/cpu.c             |  1 +
-files changed, 282 insertions(+), 175 deletions(-)
+ target/hexagon/translate.c       |  4 ++--
- delete mode 100644 include/exec/helper-tcg.h
+ target/hppa/cpu.c                |  1 +
- create mode 100644 include/exec/helper-info.c.inc
+ target/hppa/translate.c          |  4 ++--
  target/i386/tcg/tcg-cpu.c        |  1 +
  target/i386/tcg/translate.c      |  5 ++---
  target/loongarch/cpu.c           |  1 +
  target/loongarch/tcg/translate.c |  4 ++--
  target/m68k/cpu.c                |  1 +
  target/m68k/translate.c          |  4 ++--
  target/microblaze/cpu.c          |  1 +
  target/microblaze/translate.c    |  4 ++--
  target/mips/cpu.c                |  1 +
  target/mips/tcg/translate.c      |  4 ++--
  target/openrisc/cpu.c            |  1 +
  target/openrisc/translate.c      |  4 ++--
  target/ppc/cpu_init.c            |  1 +
  target/ppc/translate.c           |  4 ++--
  target/riscv/tcg/tcg-cpu.c       |  1 +
  target/riscv/translate.c         |  4 ++--
  target/rx/cpu.c                  |  1 +
  target/rx/translate.c            |  4 ++--
  target/s390x/cpu.c               |  1 +
  target/s390x/tcg/translate.c     |  4 ++--
  target/sh4/cpu.c                 |  1 +
  target/sh4/translate.c           |  4 ++--
  target/sparc/cpu.c               |  1 +
  target/sparc/translate.c         |  4 ++--
  target/tricore/cpu.c             |  1 +
  target/tricore/translate.c       |  5 ++---
  target/xtensa/cpu.c              |  1 +
  target/xtensa/translate.c        |  4 ++--
 files changed, 121 insertions(+), 62 deletions(-)
-diff --git a/MAINTAINERS b/MAINTAINERS
+diff --git a/include/exec/translator.h b/include/exec/translator.h
 index XXXXXXX..XXXXXXX 100644
---- a/MAINTAINERS
+--- a/include/exec/translator.h
-+++ b/MAINTAINERS
++++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@ F: include/exec/exec-all.h
  F: include/exec/tb-flush.h
  F: include/exec/target_long.h
  F: include/exec/helper*.h
 +F: include/exec/helper-info.c.inc
  F: include/sysemu/cpus.h
  F: include/sysemu/tcg.h
  F: include/hw/core/tcg-cpu-ops.h
 diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/helper-gen.h
 +++ b/include/exec/helper-gen.h
 @@ -XXX,XX +XXX,XX @@
--/* Helper file for declaring TCG helper functions.
+ #include "qemu/bswap.h"
--   This one expands generation functions for tcg opcodes.  */
+ #include "exec/vaddr.h"
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
-+/*
+-/**
-+ * Helper file for declaring TCG helper functions.
+- * gen_intermediate_code
-+ * This one expands generation functions for tcg opcodes.
+- * @cpu: cpu context
-+ * Define HELPER_H for the header file to be expanded,
+- * @tb: translation block
-+ * and static inline to change from global file scope.
+- * @max_insns: max number of instructions to translate
-+ */
+- * @pc: guest virtual program counter address
+- * @host_pc: host physical program counter address
- #ifndef HELPER_GEN_H
+- *
- #define HELPER_GEN_H
+- * This function must be provided by the target, which should create
+- * the target-specific DisasContext, and then invoke translator_loop.
-+#include "tcg/tcg.h"
+- */
-+#include "tcg/helper-info.h"
+-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
- #include "exec/helper-head.h"
+-                           vaddr pc, void *host_pc);
+-
- #define DEF_HELPER_FLAGS_0(name, flags, ret)                            \
+ /**
-+extern TCGHelperInfo glue(helper_info_, name);                          \
+  * DisasJumpType:
- static inline void glue(gen_helper_, name)(dh_retvar_decl0(ret))        \
+  * @DISAS_NEXT: Next instruction in program order.
- {                                                                       \
+diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
--  tcg_gen_callN(HELPER(name), dh_retvar(ret), 0, NULL);                 \
+index XXXXXXX..XXXXXXX 100644
-+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 0, NULL);  \
+--- a/include/hw/core/tcg-cpu-ops.h
 +++ b/include/hw/core/tcg-cpu-ops.h
@@ -XXX,XX +XXX,XX @@ struct TCGCPUOps {
       * Called when the first CPU is realized.
       */
      void (*initialize)(void);
 +    /**
 +     * @translate_code: Translate guest instructions to TCGOps
 +     * @cpu: cpu context
 +     * @tb: translation block
 +     * @max_insns: max number of instructions to translate
 +     * @pc: guest virtual program counter address
 +     * @host_pc: host physical program counter address
 +     *
 +     * This function must be provided by the target, which should create
 +     * the target-specific DisasContext, and then invoke translator_loop.
 +     */
 +    void (*translate_code)(CPUState *cpu, TranslationBlock *tb,
 +                           int *max_insns, vaddr pc, void *host_pc);
      /**
       * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
       *
 diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/alpha/cpu.h
 +++ b/target/alpha/cpu.h
@@ -XXX,XX +XXX,XX @@ enum {
  };
  void alpha_translate_init(void);
 +void alpha_translate_code(CPUState *cs, TranslationBlock *tb,
 +                          int *max_insns, vaddr pc, void *host_pc);
  #define CPU_RESOLVING_TYPE TYPE_ALPHA_CPU
 diff --git a/target/arm/internals.h b/target/arm/internals.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/internals.h
 +++ b/target/arm/internals.h
@@ -XXX,XX +XXX,XX @@ void init_cpreg_list(ARMCPU *cpu);
  void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu);
  void arm_translate_init(void);
 +void arm_translate_code(CPUState *cs, TranslationBlock *tb,
 +                        int *max_insns, vaddr pc, void *host_pc);
  void arm_cpu_register_gdb_commands(ARMCPU *cpu);
  void aarch64_cpu_register_gdb_commands(ARMCPU *cpu, GString *,
 diff --git a/target/avr/cpu.h b/target/avr/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/cpu.h
 +++ b/target/avr/cpu.h
@@ -XXX,XX +XXX,XX @@ static inline void set_avr_feature(CPUAVRState *env, int feature)
  }
- #define DEF_HELPER_FLAGS_1(name, flags, ret, t1)                        \
+ void avr_cpu_tcg_init(void);
-+extern TCGHelperInfo glue(helper_info_, name);                          \
++void avr_cpu_translate_code(CPUState *cs, TranslationBlock *tb,
- static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
++                            int *max_insns, vaddr pc, void *host_pc);
-     dh_arg_decl(t1, 1))                                                 \
- {                                                                       \
+ int cpu_avr_exec(CPUState *cpu);
--  TCGTemp *args[1] = { dh_arg(t1, 1) };                                 \
--  tcg_gen_callN(HELPER(name), dh_retvar(ret), 1, args);                 \
+diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h
-+    TCGTemp *args[1] = { dh_arg(t1, 1) };                               \
+index XXXXXXX..XXXXXXX 100644
-+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 1, args);  \
+--- a/target/hexagon/cpu.h
 +++ b/target/hexagon/cpu.h
@@ -XXX,XX +XXX,XX @@ static inline void cpu_get_tb_cpu_state(CPUHexagonState *env, vaddr *pc,
  typedef HexagonCPU ArchCPU;
  void hexagon_translate_init(void);
 +void hexagon_translate_code(CPUState *cs, TranslationBlock *tb,
 +                            int *max_insns, vaddr pc, void *host_pc);
  #include "exec/cpu-all.h"
 diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/cpu.h
 +++ b/target/hppa/cpu.h
@@ -XXX,XX +XXX,XX @@ static inline int HPPA_BTLB_ENTRIES(CPUHPPAState *env)
  }
- #define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2)                    \
+ void hppa_translate_init(void);
-+extern TCGHelperInfo glue(helper_info_, name);                          \
++void hppa_translate_code(CPUState *cs, TranslationBlock *tb,
- static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
++                         int *max_insns, vaddr pc, void *host_pc);
-     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2))                             \
- {                                                                       \
+ #define CPU_RESOLVING_TYPE TYPE_HPPA_CPU
--  TCGTemp *args[2] = { dh_arg(t1, 1), dh_arg(t2, 2) };                  \
--  tcg_gen_callN(HELPER(name), dh_retvar(ret), 2, args);                 \
+diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
-+    TCGTemp *args[2] = { dh_arg(t1, 1), dh_arg(t2, 2) };                \
+index XXXXXXX..XXXXXXX 100644
-+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 2, args);  \
+--- a/target/i386/tcg/helper-tcg.h
 +++ b/target/i386/tcg/helper-tcg.h
@@ -XXX,XX +XXX,XX @@ static inline target_long lshift(target_long x, int n)
  /* translate.c */
  void tcg_x86_init(void);
 +void x86_translate_code(CPUState *cs, TranslationBlock *tb,
 +                        int *max_insns, vaddr pc, void *host_pc);
  /* excp_helper.c */
  G_NORETURN void raise_exception(CPUX86State *env, int exception_index);
 diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/loongarch/internals.h
 +++ b/target/loongarch/internals.h
@@ -XXX,XX +XXX,XX @@
  #define TARGET_VIRT_MASK MAKE_64BIT_MASK(0, TARGET_VIRT_ADDR_SPACE_BITS)
  void loongarch_translate_init(void);
 +void loongarch_translate_code(CPUState *cs, TranslationBlock *tb,
 +                              int *max_insns, vaddr pc, void *host_pc);
  void G_NORETURN do_raise_exception(CPULoongArchState *env,
                                     uint32_t exception,
 diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/cpu.h
 +++ b/target/m68k/cpu.h
@@ -XXX,XX +XXX,XX @@ int m68k_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
  int m68k_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
  void m68k_tcg_init(void);
 +void m68k_translate_code(CPUState *cs, TranslationBlock *tb,
 +                         int *max_insns, vaddr pc, void *host_pc);
  void m68k_cpu_init_gdb(M68kCPU *cpu);
  uint32_t cpu_m68k_get_ccr(CPUM68KState *env);
  void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t);
 diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/cpu.h
 +++ b/target/microblaze/cpu.h
@@ -XXX,XX +XXX,XX @@ static inline void mb_cpu_write_msr(CPUMBState *env, uint32_t val)
  }
- #define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3)                \
+ void mb_tcg_init(void);
-+extern TCGHelperInfo glue(helper_info_, name);                          \
++void mb_translate_code(CPUState *cs, TranslationBlock *tb,
- static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
++                       int *max_insns, vaddr pc, void *host_pc);
-     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3))         \
- {                                                                       \
+ #define CPU_RESOLVING_TYPE TYPE_MICROBLAZE_CPU
--  TCGTemp *args[3] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3) };   \
--  tcg_gen_callN(HELPER(name), dh_retvar(ret), 3, args);                 \
+diff --git a/target/mips/tcg/tcg-internal.h b/target/mips/tcg/tcg-internal.h
-+    TCGTemp *args[3] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3) }; \
+index XXXXXXX..XXXXXXX 100644
-+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 3, args);  \
+--- a/target/mips/tcg/tcg-internal.h
- }
++++ b/target/mips/tcg/tcg-internal.h
  #define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4)            \
 +extern TCGHelperInfo glue(helper_info_, name);                          \
  static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
      dh_arg_decl(t1, 1), dh_arg_decl(t2, 2),                             \
      dh_arg_decl(t3, 3), dh_arg_decl(t4, 4))                             \
  {                                                                       \
 -  TCGTemp *args[4] = { dh_arg(t1, 1), dh_arg(t2, 2),                    \
 -                     dh_arg(t3, 3), dh_arg(t4, 4) };                    \
 -  tcg_gen_callN(HELPER(name), dh_retvar(ret), 4, args);                 \
 +    TCGTemp *args[4] = { dh_arg(t1, 1), dh_arg(t2, 2),                  \
 +                         dh_arg(t3, 3), dh_arg(t4, 4) };                \
 +    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 4, args);  \
  }
  #define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5)        \
 +extern TCGHelperInfo glue(helper_info_, name);                          \
  static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
 +    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
      dh_arg_decl(t4, 4), dh_arg_decl(t5, 5))                             \
  {                                                                       \
 -  TCGTemp *args[5] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
 -                     dh_arg(t4, 4), dh_arg(t5, 5) };                    \
 -  tcg_gen_callN(HELPER(name), dh_retvar(ret), 5, args);                 \
 +    TCGTemp *args[5] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
 +                         dh_arg(t4, 4), dh_arg(t5, 5) };                \
 +    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 5, args);  \
  }
  #define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6)    \
 +extern TCGHelperInfo glue(helper_info_, name);                          \
  static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
 +    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
      dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6))         \
  {                                                                       \
 -  TCGTemp *args[6] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
 -                     dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6) };     \
 -  tcg_gen_callN(HELPER(name), dh_retvar(ret), 6, args);                 \
 +    TCGTemp *args[6] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
 +                         dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6) }; \
 +    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 6, args);  \
  }
  #define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
 +extern TCGHelperInfo glue(helper_info_, name);                          \
  static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 -    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
 +    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
      dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
      dh_arg_decl(t7, 7))                                                 \
  {                                                                       \
 -  TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
 -                     dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),       \
 -                     dh_arg(t7, 7) };                                   \
 -  tcg_gen_callN(HELPER(name), dh_retvar(ret), 7, args);                 \
 +    TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
 +                         dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),   \
 +                         dh_arg(t7, 7) };                               \
 +    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 7, args);  \
  }
  #include "helper.h"
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
  #undef DEF_HELPER_FLAGS_5
  #undef DEF_HELPER_FLAGS_6
  #undef DEF_HELPER_FLAGS_7
 -#undef GEN_HELPER
  #endif /* HELPER_GEN_H */
 diff --git a/include/exec/helper-tcg.h b/include/exec/helper-tcg.h
 deleted file mode 100644
 index XXXXXXX..XXXXXXX
 --- a/include/exec/helper-tcg.h
 +++ /dev/null
 @@ -XXX,XX +XXX,XX @@
--/* Helper file for declaring TCG helper functions.
+ #include "cpu.h"
--   This one defines data structures private to tcg.c.  */
--
+ void mips_tcg_init(void);
--#ifndef HELPER_TCG_H
++void mips_translate_code(CPUState *cs, TranslationBlock *tb,
--#define HELPER_TCG_H
++                         int *max_insns, vaddr pc, void *host_pc);
--
--#include "exec/helper-head.h"
+ void mips_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb);
--
+ G_NORETURN void mips_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
--/* Need one more level of indirection before stringification
+diff --git a/target/openrisc/cpu.h b/target/openrisc/cpu.h
--   to get all the macros expanded first.  */
+index XXXXXXX..XXXXXXX 100644
--#define str(s) #s
+--- a/target/openrisc/cpu.h
--
++++ b/target/openrisc/cpu.h
--#define DEF_HELPER_FLAGS_0(NAME, FLAGS, ret) \
+@@ -XXX,XX +XXX,XX @@ void openrisc_cpu_dump_state(CPUState *cpu, FILE *f, int flags);
--  { .func = HELPER(NAME), .name = str(NAME), \
+ int openrisc_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
--    .flags = FLAGS | dh_callflag(ret), \
+ int openrisc_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
--    .typemask = dh_typemask(ret, 0) },
+ void openrisc_translate_init(void);
--
++void openrisc_translate_code(CPUState *cs, TranslationBlock *tb,
--#define DEF_HELPER_FLAGS_1(NAME, FLAGS, ret, t1) \
++                             int *max_insns, vaddr pc, void *host_pc);
--  { .func = HELPER(NAME), .name = str(NAME), \
+ int print_insn_or1k(bfd_vma addr, disassemble_info *info);
--    .flags = FLAGS | dh_callflag(ret), \
--    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) },
+ #ifndef CONFIG_USER_ONLY
--
+diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
--#define DEF_HELPER_FLAGS_2(NAME, FLAGS, ret, t1, t2) \
+index XXXXXXX..XXXXXXX 100644
--  { .func = HELPER(NAME), .name = str(NAME), \
+--- a/target/ppc/cpu.h
--    .flags = FLAGS | dh_callflag(ret), \
++++ b/target/ppc/cpu.h
--    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
+@@ -XXX,XX +XXX,XX @@ extern const VMStateDescription vmstate_ppc_cpu;
--    | dh_typemask(t2, 2) },
--
+ /*****************************************************************************/
--#define DEF_HELPER_FLAGS_3(NAME, FLAGS, ret, t1, t2, t3) \
+ void ppc_translate_init(void);
--  { .func = HELPER(NAME), .name = str(NAME), \
++void ppc_translate_code(CPUState *cs, TranslationBlock *tb,
--    .flags = FLAGS | dh_callflag(ret), \
++                        int *max_insns, vaddr pc, void *host_pc);
--    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
--    | dh_typemask(t2, 2) | dh_typemask(t3, 3) },
+ #if !defined(CONFIG_USER_ONLY)
--
+ void ppc_store_sdr1(CPUPPCState *env, target_ulong value);
--#define DEF_HELPER_FLAGS_4(NAME, FLAGS, ret, t1, t2, t3, t4) \
+diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
--  { .func = HELPER(NAME), .name = str(NAME), \
+index XXXXXXX..XXXXXXX 100644
--    .flags = FLAGS | dh_callflag(ret), \
+--- a/target/riscv/cpu.h
--    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
++++ b/target/riscv/cpu.h
--    | dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) },
+@@ -XXX,XX +XXX,XX @@ RISCVException smstateen_acc_ok(CPURISCVState *env, int index, uint64_t bit);
--
+ void riscv_cpu_set_mode(CPURISCVState *env, target_ulong newpriv, bool virt_en);
--#define DEF_HELPER_FLAGS_5(NAME, FLAGS, ret, t1, t2, t3, t4, t5) \
--  { .func = HELPER(NAME), .name = str(NAME), \
+ void riscv_translate_init(void);
--    .flags = FLAGS | dh_callflag(ret), \
++void riscv_translate_code(CPUState *cs, TranslationBlock *tb,
--    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
++                          int *max_insns, vaddr pc, void *host_pc);
 -    | dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
 -    | dh_typemask(t5, 5) },
 -
 -#define DEF_HELPER_FLAGS_6(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6) \
 -  { .func = HELPER(NAME), .name = str(NAME), \
 -    .flags = FLAGS | dh_callflag(ret), \
 -    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
 -    | dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
 -    | dh_typemask(t5, 5) | dh_typemask(t6, 6) },
 -
 -#define DEF_HELPER_FLAGS_7(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6, t7) \
 -  { .func = HELPER(NAME), .name = str(NAME), .flags = FLAGS, \
 -    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
 -    | dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
 -    | dh_typemask(t5, 5) | dh_typemask(t6, 6) | dh_typemask(t7, 7) },
 -
 -#include "helper.h"
 -#include "accel/tcg/tcg-runtime.h"
 -#include "accel/tcg/plugin-helpers.h"
 -
 -#undef str
 -#undef DEF_HELPER_FLAGS_0
 -#undef DEF_HELPER_FLAGS_1
 -#undef DEF_HELPER_FLAGS_2
 -#undef DEF_HELPER_FLAGS_3
 -#undef DEF_HELPER_FLAGS_4
 -#undef DEF_HELPER_FLAGS_5
 -#undef DEF_HELPER_FLAGS_6
 -#undef DEF_HELPER_FLAGS_7
 -
 -#endif /* HELPER_TCG_H */
 diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/typedefs.h
 +++ b/include/qemu/typedefs.h
@@ -XXX,XX +XXX,XX @@ typedef struct ReservedRegion ReservedRegion;
  typedef struct SavedIOTLB SavedIOTLB;
  typedef struct SHPCDevice SHPCDevice;
  typedef struct SSIBus SSIBus;
 +typedef struct TCGHelperInfo TCGHelperInfo;
  typedef struct TranslationBlock TranslationBlock;
  typedef struct VirtIODevice VirtIODevice;
  typedef struct Visitor Visitor;
 diff --git a/include/tcg/helper-info.h b/include/tcg/helper-info.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/helper-info.h
 +++ b/include/tcg/helper-info.h
@@ -XXX,XX +XXX,XX @@ typedef struct TCGCallArgumentLoc {
      unsigned tmp_subindex       : 2;
  } TCGCallArgumentLoc;
 -typedef struct TCGHelperInfo {
 +struct TCGHelperInfo {
      void *func;
      const char *name;
 +
-+    /* Used with g_once_init_enter. */
+ G_NORETURN void riscv_raise_exception(CPURISCVState *env,
- #ifdef CONFIG_TCG_INTERPRETER
+                                       uint32_t exception, uintptr_t pc);
-     ffi_cif *cif;
-+#else
+diff --git a/target/rx/cpu.h b/target/rx/cpu.h
-+    uintptr_t init;
+index XXXXXXX..XXXXXXX 100644
- #endif
+--- a/target/rx/cpu.h
 +++ b/target/rx/cpu.h
@@ -XXX,XX +XXX,XX @@ int rx_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
  int rx_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
  void rx_translate_init(void);
 +void rx_translate_code(CPUState *cs, TranslationBlock *tb,
 +                       int *max_insns, vaddr pc, void *host_pc);
  void rx_cpu_unpack_psw(CPURXState *env, uint32_t psw, int rte);
  #include "exec/cpu-all.h"
 diff --git a/target/s390x/s390x-internal.h b/target/s390x/s390x-internal.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/s390x-internal.h
 +++ b/target/s390x/s390x-internal.h
@@ -XXX,XX +XXX,XX @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3,
  /* translate.c */
  void s390x_translate_init(void);
 +void s390x_translate_code(CPUState *cs, TranslationBlock *tb,
 +                          int *max_insns, vaddr pc, void *host_pc);
  void s390x_restore_state_to_opc(CPUState *cs,
                                  const TranslationBlock *tb,
                                  const uint64_t *data);
 diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/cpu.h
 +++ b/target/sh4/cpu.h
@@ -XXX,XX +XXX,XX @@ G_NORETURN void superh_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
                                                 uintptr_t retaddr);
  void sh4_translate_init(void);
 +void sh4_translate_code(CPUState *cs, TranslationBlock *tb,
 +                        int *max_insns, vaddr pc, void *host_pc);
  #if !defined(CONFIG_USER_ONLY)
  hwaddr superh_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr);
 diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/cpu.h
 +++ b/target/sparc/cpu.h
@@ -XXX,XX +XXX,XX @@ int sparc_cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
  /* translate.c */
  void sparc_tcg_init(void);
 +void sparc_translate_code(CPUState *cs, TranslationBlock *tb,
 +                          int *max_insns, vaddr pc, void *host_pc);
  /* fop_helper.c */
  target_ulong cpu_get_fsr(CPUSPARCState *);
 diff --git a/target/tricore/cpu.h b/target/tricore/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tricore/cpu.h
 +++ b/target/tricore/cpu.h
@@ -XXX,XX +XXX,XX @@ FIELD(TB_FLAGS, PRIV, 0, 2)
  void cpu_state_reset(CPUTriCoreState *s);
  void tricore_tcg_init(void);
 +void tricore_translate_code(CPUState *cs, TranslationBlock *tb,
 +                            int *max_insns, vaddr pc, void *host_pc);
  static inline void cpu_get_tb_cpu_state(CPUTriCoreState *env, vaddr *pc,
                                          uint64_t *cs_base, uint32_t *flags)
 diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
 index XXXXXXX..XXXXXXX 100644
 --- a/target/xtensa/cpu.h
 +++ b/target/xtensa/cpu.h
@@ -XXX,XX +XXX,XX @@ G_NORETURN void xtensa_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
  void xtensa_collect_sr_names(const XtensaConfig *config);
  void xtensa_translate_init(void);
 +void xtensa_translate_code(CPUState *cs, TranslationBlock *tb,
 +                           int *max_insns, vaddr pc, void *host_pc);
  void **xtensa_get_regfile_by_name(const char *name, int entries, int bits);
  void xtensa_breakpoint_handler(CPUState *cs);
  void xtensa_register_core(XtensaConfigList *node);
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ bool tcg_exec_realizefn(CPUState *cpu, Error **errp)
      if (!tcg_target_initialized) {
          /* Check mandatory TCGCPUOps handlers */
 +        const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops;
  #ifndef CONFIG_USER_ONLY
 -        assert(cpu->cc->tcg_ops->cpu_exec_halt);
 -        assert(cpu->cc->tcg_ops->cpu_exec_interrupt);
 +        assert(tcg_ops->cpu_exec_halt);
 +        assert(tcg_ops->cpu_exec_interrupt);
  #endif /* !CONFIG_USER_ONLY */
 -        cpu->cc->tcg_ops->initialize();
 +        assert(tcg_ops->translate_code);
 +        tcg_ops->initialize();
          tcg_target_initialized = true;
      }
 diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
      tcg_func_start(tcg_ctx);
 -    tcg_ctx->cpu = env_cpu(env);
 -    gen_intermediate_code(env_cpu(env), tb, max_insns, pc, host_pc);
 +    CPUState *cs = env_cpu(env);
 +    tcg_ctx->cpu = cs;
 +    cs->cc->tcg_ops->translate_code(cs, tb, max_insns, pc, host_pc);
 +
-     unsigned typemask           : 32;
+     assert(tb->size != 0);
-     unsigned flags              : 8;
+     tcg_ctx->cpu = NULL;
-     unsigned nr_in              : 8;
+     *max_insns = tb->icount;
-@@ -XXX,XX +XXX,XX @@ typedef struct TCGHelperInfo {
+@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
+             /*
-     /* Maximum physical arguments are constrained by TCG_TYPE_I128. */
+              * Overflow of code_gen_buffer, or the current slice of it.
-     TCGCallArgumentLoc in[MAX_CALL_IARGS * (128 / TCG_TARGET_REG_BITS)];
+              *
--} TCGHelperInfo;
+-             * TODO: We don't need to re-do gen_intermediate_code, nor
-+};
++             * TODO: We don't need to re-do tcg_ops->translate_code, nor
+              * should we re-do the tcg optimization currently hidden
- #endif /* TCG_HELPER_INFO_H */
+              * inside tcg_gen_code.  All that should be required is to
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+              * flush the TBs, allocate a new TB, re-initialize it per
-index XXXXXXX..XXXXXXX 100644
+diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
---- a/include/tcg/tcg.h
+index XXXXXXX..XXXXXXX 100644
-+++ b/include/tcg/tcg.h
+--- a/target/alpha/cpu.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TCGTargetOpDef {
++++ b/target/alpha/cpu.c
+@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps alpha_sysemu_ops = {
- bool tcg_op_supported(TCGOpcode op);
+ static const TCGCPUOps alpha_tcg_ops = {
--void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args);
+     .initialize = alpha_translate_init,
-+void tcg_gen_callN(TCGHelperInfo *, TCGTemp *ret, int nargs, TCGTemp **args);
++    .translate_code = alpha_translate_code,
+     .synchronize_from_tb = alpha_cpu_synchronize_from_tb,
- TCGOp *tcg_emit_op(TCGOpcode opc, unsigned nargs);
+     .restore_state_to_opc = alpha_restore_state_to_opc,
- void tcg_op_remove(TCGContext *s, TCGOp *op);
 diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/plugin-gen.c
 +++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/exec-all.h"
  #include "exec/plugin-gen.h"
  #include "exec/translator.h"
 +#include "exec/helper-proto.h"
 +
 +#define HELPER_H  "accel/tcg/plugin-helpers.h"
 +#include "exec/helper-info.c.inc"
 +#undef  HELPER_H
  #ifdef CONFIG_SOFTMMU
  # define CONFIG_SOFTMMU_GATE 1
 diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-runtime.c
 +++ b/accel/tcg/tcg-runtime.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/log.h"
  #include "tcg/tcg.h"
 +#define HELPER_H  "accel/tcg/tcg-runtime.h"
 +#include "exec/helper-info.c.inc"
 +#undef  HELPER_H
 +
  /* 32-bit helpers */
  int32_t HELPER(div_i32)(int32_t arg1, int32_t arg2)
 diff --git a/target/alpha/translate.c b/target/alpha/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/alpha/translate.c
 +++ b/target/alpha/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps alpha_tr_ops = {
- #include "exec/translator.h"
+     .tb_stop            = alpha_tr_tb_stop,
- #include "exec/log.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void alpha_translate_code(CPUState *cpu, TranslationBlock *tb,
++                          int *max_insns, vaddr pc, void *host_pc)
- #undef ALPHA_DEBUG_DISAS
+ {
- #define CONFIG_SOFTFLOAT_INLINE
+     DisasContext dc;
      translator_loop(cpu, tb, max_insns, pc, host_pc, &alpha_tr_ops, &dc.base);
 diff --git a/target/arm/cpu.c b/target/arm/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/cpu.c
 +++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps arm_sysemu_ops = {
  #ifdef CONFIG_TCG
  static const TCGCPUOps arm_tcg_ops = {
      .initialize = arm_translate_init,
 +    .translate_code = arm_translate_code,
      .synchronize_from_tb = arm_cpu_synchronize_from_tb,
      .debug_excp_handler = arm_debug_excp_handler,
      .restore_state_to_opc = arm_restore_state_to_opc,
 diff --git a/target/arm/tcg/cpu-v7m.c b/target/arm/tcg/cpu-v7m.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/cpu-v7m.c
 +++ b/target/arm/tcg/cpu-v7m.c
@@ -XXX,XX +XXX,XX @@ static void cortex_m55_initfn(Object *obj)
  static const TCGCPUOps arm_v7m_tcg_ops = {
      .initialize = arm_translate_init,
 +    .translate_code = arm_translate_code,
      .synchronize_from_tb = arm_cpu_synchronize_from_tb,
      .debug_excp_handler = arm_debug_excp_handler,
      .restore_state_to_opc = arm_restore_state_to_opc,
 diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/tcg/translate.c
 +++ b/target/arm/tcg/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps thumb_translator_ops = {
- #include "exec/log.h"
+     .tb_stop            = arm_tr_tb_stop,
- #include "cpregs.h"
+ };
-+#define HELPER_H "helper.h"
+-/* generate intermediate code for basic block 'tb'.  */
-+#include "exec/helper-info.c.inc"
+-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-+#undef  HELPER_H
+-                           vaddr pc, void *host_pc)
++void arm_translate_code(CPUState *cpu, TranslationBlock *tb,
- #define ENABLE_ARCH_4T    arm_dc_feature(s, ARM_FEATURE_V4T)
++                        int *max_insns, vaddr pc, void *host_pc)
- #define ENABLE_ARCH_5     arm_dc_feature(s, ARM_FEATURE_V5)
+ {
      DisasContext dc = { };
      const TranslatorOps *ops = &arm_translator_ops;
 diff --git a/target/avr/cpu.c b/target/avr/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/cpu.c
 +++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps avr_sysemu_ops = {
  static const TCGCPUOps avr_tcg_ops = {
      .initialize = avr_cpu_tcg_init,
 +    .translate_code = avr_cpu_translate_code,
      .synchronize_from_tb = avr_cpu_synchronize_from_tb,
      .restore_state_to_opc = avr_restore_state_to_opc,
      .cpu_exec_interrupt = avr_cpu_exec_interrupt,
 diff --git a/target/avr/translate.c b/target/avr/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/avr/translate.c
 +++ b/target/avr/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool trans_WDR(DisasContext *ctx, arg_WDR *a)
- #include "exec/translator.h"
+  *
- #include "exec/gen-icount.h"
+  *    - translate()
+  *    - canonicalize_skip()
-+#define HELPER_H "helper.h"
+- *    - gen_intermediate_code()
-+#include "exec/helper-info.c.inc"
++ *    - translate_code()
-+#undef  HELPER_H
+  *    - restore_state_to_opc()
-+
+  *
-+
+  */
- /*
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps avr_tr_ops = {
-  *  Define if you want a BREAK instruction translated to a breakpoint
+     .tb_stop            = avr_tr_tb_stop,
-  *  Active debugging connection is assumed
+ };
-diff --git a/target/cris/translate.c b/target/cris/translate.c
-index XXXXXXX..XXXXXXX 100644
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
---- a/target/cris/translate.c
+-                           vaddr pc, void *host_pc)
-+++ b/target/cris/translate.c
++void avr_cpu_translate_code(CPUState *cs, TranslationBlock *tb,
-@@ -XXX,XX +XXX,XX @@
++                            int *max_insns, vaddr pc, void *host_pc)
- #include "exec/translator.h"
+ {
- #include "crisv32-decode.h"
+     DisasContext dc = { };
- #include "qemu/qemu-print.h"
+     translator_loop(cs, tb, max_insns, pc, host_pc, &avr_tr_ops, &dc.base);
--
+diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
- #include "exec/helper-gen.h"
+index XXXXXXX..XXXXXXX 100644
--
+--- a/target/hexagon/cpu.c
- #include "exec/log.h"
++++ b/target/hexagon/cpu.c
+@@ -XXX,XX +XXX,XX @@ static void hexagon_cpu_init(Object *obj)
-+#define HELPER_H "helper.h"
-+#include "exec/helper-info.c.inc"
+ static const TCGCPUOps hexagon_tcg_ops = {
-+#undef  HELPER_H
+     .initialize = hexagon_translate_init,
-+
++    .translate_code = hexagon_translate_code,
+     .synchronize_from_tb = hexagon_cpu_synchronize_from_tb,
- #define DISAS_CRIS 0
+     .restore_state_to_opc = hexagon_restore_state_to_opc,
- #if DISAS_CRIS
+ };
 diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hexagon/translate.c
 +++ b/target/hexagon/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps hexagon_tr_ops = {
- #include "genptr.h"
+     .tb_stop            = hexagon_tr_tb_stop,
- #include "printinsn.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void hexagon_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                            int *max_insns, vaddr pc, void *host_pc)
- #include "analyze_funcs_generated.c.inc"
+ {
+     DisasContext ctx;
- typedef void (*AnalyzeInsn)(DisasContext *ctx);
 diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/cpu.c
 +++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps hppa_sysemu_ops = {
  static const TCGCPUOps hppa_tcg_ops = {
      .initialize = hppa_translate_init,
 +    .translate_code = hppa_translate_code,
      .synchronize_from_tb = hppa_cpu_synchronize_from_tb,
      .restore_state_to_opc = hppa_restore_state_to_opc,
 diff --git a/target/hppa/translate.c b/target/hppa/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/hppa/translate.c
 +++ b/target/hppa/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps hppa_tr_ops = {
- #include "exec/translator.h"
+ #endif
- #include "exec/log.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void hppa_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                         int *max_insns, vaddr pc, void *host_pc)
-+
+ {
- /* Since we have a distinction between register size and address size,
+     DisasContext ctx = { };
-    we need to redefine all of these.  */
+     translator_loop(cs, tb, max_insns, pc, host_pc, &hppa_tr_ops, &ctx.base);
+diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/tcg-cpu.c
 +++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static bool x86_debug_check_breakpoint(CPUState *cs)
  static const TCGCPUOps x86_tcg_ops = {
      .initialize = tcg_x86_init,
 +    .translate_code = x86_translate_code,
      .synchronize_from_tb = x86_cpu_synchronize_from_tb,
      .restore_state_to_opc = x86_restore_state_to_opc,
      .cpu_exec_enter = x86_cpu_exec_enter,
 diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/translate.c
 +++ b/target/i386/tcg/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps i386_tr_ops = {
+     .tb_stop            = i386_tr_tb_stop,
- #include "exec/log.h"
+ };
-+#define HELPER_H "helper.h"
+-/* generate intermediate code for basic block 'tb'.  */
-+#include "exec/helper-info.c.inc"
+-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-+#undef  HELPER_H
+-                           vaddr pc, void *host_pc)
-+
++void x86_translate_code(CPUState *cpu, TranslationBlock *tb,
-+
++                        int *max_insns, vaddr pc, void *host_pc)
- #define PREFIX_REPZ   0x01
+ {
- #define PREFIX_REPNZ  0x02
+     DisasContext dc;
- #define PREFIX_LOCK   0x04
-diff --git a/target/loongarch/translate.c b/target/loongarch/translate.c
+diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/loongarch/translate.c
+--- a/target/loongarch/cpu.c
-+++ b/target/loongarch/translate.c
++++ b/target/loongarch/cpu.c
-@@ -XXX,XX +XXX,XX @@ static TCGv cpu_lladdr, cpu_llval;
+@@ -XXX,XX +XXX,XX @@ static void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags)
- #include "exec/gen-icount.h"
+ static const TCGCPUOps loongarch_tcg_ops = {
+     .initialize = loongarch_translate_init,
-+#define HELPER_H "helper.h"
++    .translate_code = loongarch_translate_code,
-+#include "exec/helper-info.c.inc"
+     .synchronize_from_tb = loongarch_cpu_synchronize_from_tb,
-+#undef  HELPER_H
+     .restore_state_to_opc = loongarch_restore_state_to_opc,
-+
- #define DISAS_STOP        DISAS_TARGET_0
+diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
- #define DISAS_EXIT        DISAS_TARGET_1
+index XXXXXXX..XXXXXXX 100644
- #define DISAS_EXIT_UPDATE DISAS_TARGET_2
+--- a/target/loongarch/tcg/translate.c
 +++ b/target/loongarch/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps loongarch_tr_ops = {
      .tb_stop            = loongarch_tr_tb_stop,
  };
 -void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
 -                           vaddr pc, void *host_pc)
 +void loongarch_translate_code(CPUState *cs, TranslationBlock *tb,
 +                              int *max_insns, vaddr pc, void *host_pc)
  {
      DisasContext ctx;
 diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/cpu.c
 +++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps m68k_sysemu_ops = {
  static const TCGCPUOps m68k_tcg_ops = {
      .initialize = m68k_tcg_init,
 +    .translate_code = m68k_translate_code,
      .restore_state_to_opc = m68k_restore_state_to_opc,
  #ifndef CONFIG_USER_ONLY
 diff --git a/target/m68k/translate.c b/target/m68k/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/m68k/translate.c
 +++ b/target/m68k/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps m68k_tr_ops = {
- #include "exec/log.h"
+     .tb_stop            = m68k_tr_tb_stop,
- #include "fpu/softfloat.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void m68k_translate_code(CPUState *cpu, TranslationBlock *tb,
++                         int *max_insns, vaddr pc, void *host_pc)
- //#define DEBUG_DISPATCH 1
+ {
      DisasContext dc;
      translator_loop(cpu, tb, max_insns, pc, host_pc, &m68k_tr_ops, &dc.base);
 diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/cpu.c
 +++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps mb_sysemu_ops = {
  static const TCGCPUOps mb_tcg_ops = {
      .initialize = mb_tcg_init,
 +    .translate_code = mb_translate_code,
      .synchronize_from_tb = mb_cpu_synchronize_from_tb,
      .restore_state_to_opc = mb_restore_state_to_opc,
 diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/microblaze/translate.c
 +++ b/target/microblaze/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps mb_tr_ops = {
+     .tb_stop            = mb_tr_tb_stop,
- #include "exec/log.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void mb_translate_code(CPUState *cpu, TranslationBlock *tb,
-+
++                       int *max_insns, vaddr pc, void *host_pc)
- #define EXTRACT_FIELD(src, start, end) \
+ {
-             (((src) >> start) & ((1 << (end - start + 1)) - 1))
+     DisasContext dc;
      translator_loop(cpu, tb, max_insns, pc, host_pc, &mb_tr_ops, &dc.base);
 diff --git a/target/mips/cpu.c b/target/mips/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/cpu.c
 +++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static const Property mips_cpu_properties[] = {
  #include "hw/core/tcg-cpu-ops.h"
  static const TCGCPUOps mips_tcg_ops = {
      .initialize = mips_tcg_init,
 +    .translate_code = mips_translate_code,
      .synchronize_from_tb = mips_cpu_synchronize_from_tb,
      .restore_state_to_opc = mips_restore_state_to_opc,
 diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/mips/tcg/translate.c
 +++ b/target/mips/tcg/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps mips_tr_ops = {
- #include "fpu_helper.h"
+     .tb_stop            = mips_tr_tb_stop,
- #include "translate.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void mips_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                         int *max_insns, vaddr pc, void *host_pc)
-+
+ {
- /*
+     DisasContext ctx;
-  * Many sysemu-only helpers are not reachable for user-only.
-  * Define stub generators here, so that we need not either sprinkle
+diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
-diff --git a/target/nios2/translate.c b/target/nios2/translate.c
+index XXXXXXX..XXXXXXX 100644
-index XXXXXXX..XXXXXXX 100644
+--- a/target/openrisc/cpu.c
---- a/target/nios2/translate.c
++++ b/target/openrisc/cpu.c
-+++ b/target/nios2/translate.c
+@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps openrisc_sysemu_ops = {
-@@ -XXX,XX +XXX,XX @@
- #include "exec/gen-icount.h"
+ static const TCGCPUOps openrisc_tcg_ops = {
- #include "semihosting/semihost.h"
+     .initialize = openrisc_translate_init,
++    .translate_code = openrisc_translate_code,
-+#define HELPER_H "helper.h"
+     .synchronize_from_tb = openrisc_cpu_synchronize_from_tb,
-+#include "exec/helper-info.c.inc"
+     .restore_state_to_opc = openrisc_restore_state_to_opc,
 +#undef  HELPER_H
 +
 +
  /* is_jmp field values */
  #define DISAS_UPDATE  DISAS_TARGET_1 /* cpu state was modified dynamically */
 diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/openrisc/translate.c
 +++ b/target/openrisc/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps openrisc_tr_ops = {
+     .tb_stop            = openrisc_tr_tb_stop,
- #include "exec/log.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void openrisc_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                             int *max_insns, vaddr pc, void *host_pc)
-+
+ {
- /* is_jmp field values */
+     DisasContext ctx;
- #define DISAS_EXIT    DISAS_TARGET_0  /* force exit to main loop */
- #define DISAS_JUMP    DISAS_TARGET_1  /* exit via jmp_pc/jmp_pc_imm */
+diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/cpu_init.c
 +++ b/target/ppc/cpu_init.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps ppc_sysemu_ops = {
  static const TCGCPUOps ppc_tcg_ops = {
    .initialize = ppc_translate_init,
 +  .translate_code = ppc_translate_code,
    .restore_state_to_opc = ppc_restore_state_to_opc,
  #ifdef CONFIG_USER_ONLY
 diff --git a/target/ppc/translate.c b/target/ppc/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/translate.c
 +++ b/target/ppc/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps ppc_tr_ops = {
- #include "qemu/qemu-print.h"
+     .tb_stop            = ppc_tr_tb_stop,
- #include "qapi/error.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void ppc_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                        int *max_insns, vaddr pc, void *host_pc)
- #define CPU_SINGLE_STEP 0x1
+ {
- #define CPU_BRANCH_STEP 0x2
+     DisasContext ctx;
 diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/tcg/tcg-cpu.c
 +++ b/target/riscv/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_restore_state_to_opc(CPUState *cs,
  static const TCGCPUOps riscv_tcg_ops = {
      .initialize = riscv_translate_init,
 +    .translate_code = riscv_translate_code,
      .synchronize_from_tb = riscv_cpu_synchronize_from_tb,
      .restore_state_to_opc = riscv_restore_state_to_opc,
 diff --git a/target/riscv/translate.c b/target/riscv/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/riscv/translate.c
 +++ b/target/riscv/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps riscv_tr_ops = {
- #include "instmap.h"
+     .tb_stop            = riscv_tr_tb_stop,
- #include "internals.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void riscv_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                          int *max_insns, vaddr pc, void *host_pc)
- /* global register indices */
+ {
- static TCGv cpu_gpr[32], cpu_gprh[32], cpu_pc, cpu_vl, cpu_vstart;
+     DisasContext ctx;
- static TCGv_i64 cpu_fpr[32]; /* assume F and D extensions */
 diff --git a/target/rx/cpu.c b/target/rx/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/cpu.c
 +++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps rx_sysemu_ops = {
  static const TCGCPUOps rx_tcg_ops = {
      .initialize = rx_translate_init,
 +    .translate_code = rx_translate_code,
      .synchronize_from_tb = rx_cpu_synchronize_from_tb,
      .restore_state_to_opc = rx_restore_state_to_opc,
      .tlb_fill = rx_cpu_tlb_fill,
 diff --git a/target/rx/translate.c b/target/rx/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/translate.c
 +++ b/target/rx/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps rx_tr_ops = {
- #include "exec/translator.h"
+     .tb_stop            = rx_tr_tb_stop,
- #include "exec/log.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void rx_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                       int *max_insns, vaddr pc, void *host_pc)
-+
+ {
- typedef struct DisasContext {
+     DisasContext dc;
-     DisasContextBase base;
-     CPURXState *env;
+diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/cpu.c
 +++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ void cpu_get_tb_cpu_state(CPUS390XState *env, vaddr *pc,
  static const TCGCPUOps s390_tcg_ops = {
      .initialize = s390x_translate_init,
 +    .translate_code = s390x_translate_code,
      .restore_state_to_opc = s390x_restore_state_to_opc,
  #ifdef CONFIG_USER_ONLY
 diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/s390x/tcg/translate.c
 +++ b/target/s390x/tcg/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps s390x_tr_ops = {
- #include "exec/log.h"
+     .disas_log          = s390x_tr_disas_log,
- #include "qemu/atomic128.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void s390x_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                          int *max_insns, vaddr pc, void *host_pc)
+ {
- /* Information that (most) every instruction needs to manipulate.  */
+     DisasContext dc;
- typedef struct DisasContext DisasContext;
 diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/cpu.c
 +++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps sh4_sysemu_ops = {
  static const TCGCPUOps superh_tcg_ops = {
      .initialize = sh4_translate_init,
 +    .translate_code = sh4_translate_code,
      .synchronize_from_tb = superh_cpu_synchronize_from_tb,
      .restore_state_to_opc = superh_restore_state_to_opc,
 diff --git a/target/sh4/translate.c b/target/sh4/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/translate.c
 +++ b/target/sh4/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps sh4_tr_ops = {
- #include "exec/log.h"
+     .tb_stop            = sh4_tr_tb_stop,
- #include "qemu/qemu-print.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void sh4_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                        int *max_insns, vaddr pc, void *host_pc)
+ {
- typedef struct DisasContext {
+     DisasContext ctx;
-     DisasContextBase base;
 diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/cpu.c
 +++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps sparc_sysemu_ops = {
  static const TCGCPUOps sparc_tcg_ops = {
      .initialize = sparc_tcg_init,
 +    .translate_code = sparc_translate_code,
      .synchronize_from_tb = sparc_cpu_synchronize_from_tb,
      .restore_state_to_opc = sparc_restore_state_to_opc,
 diff --git a/target/sparc/translate.c b/target/sparc/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sparc/translate.c
 +++ b/target/sparc/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps sparc_tr_ops = {
- #include "exec/log.h"
+     .tb_stop            = sparc_tr_tb_stop,
- #include "asi.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void sparc_translate_code(CPUState *cs, TranslationBlock *tb,
++                          int *max_insns, vaddr pc, void *host_pc)
- #define DYNAMIC_PC  1 /* dynamic pc value */
+ {
- #define JUMP_PC     2 /* dynamic pc value which takes only two values
+     DisasContext dc = {};
 diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tricore/cpu.c
 +++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps tricore_sysemu_ops = {
  static const TCGCPUOps tricore_tcg_ops = {
      .initialize = tricore_tcg_init,
 +    .translate_code = tricore_translate_code,
      .synchronize_from_tb = tricore_cpu_synchronize_from_tb,
      .restore_state_to_opc = tricore_restore_state_to_opc,
      .tlb_fill = tricore_cpu_tlb_fill,
 diff --git a/target/tricore/translate.c b/target/tricore/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/tricore/translate.c
 +++ b/target/tricore/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps tricore_tr_ops = {
- #include "exec/translator.h"
+     .tb_stop            = tricore_tr_tb_stop,
- #include "exec/log.h"
+ };
-+#define HELPER_H "helper.h"
+-
-+#include "exec/helper-info.c.inc"
+-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-+#undef  HELPER_H
+-                           vaddr pc, void *host_pc)
-+
++void tricore_translate_code(CPUState *cs, TranslationBlock *tb,
-+
++                            int *max_insns, vaddr pc, void *host_pc)
- /*
+ {
-  * TCG registers
+     DisasContext ctx;
-  */
+     translator_loop(cs, tb, max_insns, pc, host_pc,
 diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/xtensa/cpu.c
 +++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps xtensa_sysemu_ops = {
  static const TCGCPUOps xtensa_tcg_ops = {
      .initialize = xtensa_translate_init,
 +    .translate_code = xtensa_translate_code,
      .debug_excp_handler = xtensa_breakpoint_handler,
      .restore_state_to_opc = xtensa_restore_state_to_opc,
 diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/xtensa/translate.c
 +++ b/target/xtensa/translate.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static const TranslatorOps xtensa_translator_ops = {
+     .tb_stop            = xtensa_tr_tb_stop,
- #include "exec/log.h"
+ };
-+#define HELPER_H "helper.h"
+-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-+#include "exec/helper-info.c.inc"
+-                           vaddr pc, void *host_pc)
-+#undef  HELPER_H
++void xtensa_translate_code(CPUState *cpu, TranslationBlock *tb,
-+
++                           int *max_insns, vaddr pc, void *host_pc)
+ {
- struct DisasContext {
+     DisasContext dc = {};
-     DisasContextBase base;
+     translator_loop(cpu, tb, max_insns, pc, host_pc,
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_pool_reset(TCGContext *s)
      s->pool_current = NULL;
  }
 -#include "exec/helper-proto.h"
 -
 -static TCGHelperInfo all_helpers[] = {
 -#include "exec/helper-tcg.h"
 -};
 -static GHashTable *helper_table;
 -
  /*
   * Create TCGHelperInfo structures for "tcg/tcg-ldst.h" functions,
   * akin to what "exec/helper-tcg.h" does with DEF_HELPER_FLAGS_N.
@@ -XXX,XX +XXX,XX @@ static ffi_type *typecode_to_ffi(int argmask)
      g_assert_not_reached();
  }
 -static void init_ffi_layouts(void)
 +static ffi_cif *init_ffi_layout(TCGHelperInfo *info)
  {
 -    /* g_direct_hash/equal for direct comparisons on uint32_t.  */
 -    GHashTable *ffi_table = g_hash_table_new(NULL, NULL);
 +    unsigned typemask = info->typemask;
 +    struct {
 +        ffi_cif cif;
 +        ffi_type *args[];
 +    } *ca;
 +    ffi_status status;
 +    int nargs;
 -    for (int i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
 -        TCGHelperInfo *info = &all_helpers[i];
 -        unsigned typemask = info->typemask;
 -        gpointer hash = (gpointer)(uintptr_t)typemask;
 -        struct {
 -            ffi_cif cif;
 -            ffi_type *args[];
 -        } *ca;
 -        ffi_status status;
 -        int nargs;
 -        ffi_cif *cif;
 +    /* Ignoring the return type, find the last non-zero field. */
 +    nargs = 32 - clz32(typemask >> 3);
 +    nargs = DIV_ROUND_UP(nargs, 3);
 +    assert(nargs <= MAX_CALL_IARGS);
 -        cif = g_hash_table_lookup(ffi_table, hash);
 -        if (cif) {
 -            info->cif = cif;
 -            continue;
 +    ca = g_malloc0(sizeof(*ca) + nargs * sizeof(ffi_type *));
 +    ca->cif.rtype = typecode_to_ffi(typemask & 7);
 +    ca->cif.nargs = nargs;
 +
 +    if (nargs != 0) {
 +        ca->cif.arg_types = ca->args;
 +        for (int j = 0; j < nargs; ++j) {
 +            int typecode = extract32(typemask, (j + 1) * 3, 3);
 +            ca->args[j] = typecode_to_ffi(typecode);
          }
 -
 -        /* Ignoring the return type, find the last non-zero field. */
 -        nargs = 32 - clz32(typemask >> 3);
 -        nargs = DIV_ROUND_UP(nargs, 3);
 -        assert(nargs <= MAX_CALL_IARGS);
 -
 -        ca = g_malloc0(sizeof(*ca) + nargs * sizeof(ffi_type *));
 -        ca->cif.rtype = typecode_to_ffi(typemask & 7);
 -        ca->cif.nargs = nargs;
 -
 -        if (nargs != 0) {
 -            ca->cif.arg_types = ca->args;
 -            for (int j = 0; j < nargs; ++j) {
 -                int typecode = extract32(typemask, (j + 1) * 3, 3);
 -                ca->args[j] = typecode_to_ffi(typecode);
 -            }
 -        }
 -
 -        status = ffi_prep_cif(&ca->cif, FFI_DEFAULT_ABI, nargs,
 -                              ca->cif.rtype, ca->cif.arg_types);
 -        assert(status == FFI_OK);
 -
 -        cif = &ca->cif;
 -        info->cif = cif;
 -        g_hash_table_insert(ffi_table, hash, (gpointer)cif);
      }
 -    g_hash_table_destroy(ffi_table);
 +    status = ffi_prep_cif(&ca->cif, FFI_DEFAULT_ABI, nargs,
 +                          ca->cif.rtype, ca->cif.arg_types);
 +    assert(status == FFI_OK);
 +
 +    return &ca->cif;
  }
 +
 +#define HELPER_INFO_INIT(I)      (&(I)->cif)
 +#define HELPER_INFO_INIT_VAL(I)  init_ffi_layout(I)
 +#else
 +#define HELPER_INFO_INIT(I)      (&(I)->init)
 +#define HELPER_INFO_INIT_VAL(I)  1
  #endif /* CONFIG_TCG_INTERPRETER */
  static inline bool arg_slot_reg_p(unsigned arg_slot)
@@ -XXX,XX +XXX,XX @@ static void tcg_context_init(unsigned max_cpus)
          args_ct += n;
      }
 -    /* Register helpers.  */
 -    /* Use g_direct_hash/equal for direct pointer comparisons on func.  */
 -    helper_table = g_hash_table_new(NULL, NULL);
 -
 -    for (i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
 -        init_call_layout(&all_helpers[i]);
 -        g_hash_table_insert(helper_table, (gpointer)all_helpers[i].func,
 -                            (gpointer)&all_helpers[i]);
 -    }
 -
      init_call_layout(&info_helper_ld32_mmu);
      init_call_layout(&info_helper_ld64_mmu);
      init_call_layout(&info_helper_ld128_mmu);
@@ -XXX,XX +XXX,XX @@ static void tcg_context_init(unsigned max_cpus)
      init_call_layout(&info_helper_st64_mmu);
      init_call_layout(&info_helper_st128_mmu);
 -#ifdef CONFIG_TCG_INTERPRETER
 -    init_ffi_layouts();
 -#endif
 -
      tcg_target_init(s);
      process_op_defs(s);
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
  static TCGOp *tcg_op_alloc(TCGOpcode opc, unsigned nargs);
 -void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
 +void tcg_gen_callN(TCGHelperInfo *info, TCGTemp *ret, int nargs, TCGTemp **args)
  {
 -    const TCGHelperInfo *info;
      TCGv_i64 extend_free[MAX_CALL_IARGS];
      int n_extend = 0;
      TCGOp *op;
      int i, n, pi = 0, total_args;
 -    info = g_hash_table_lookup(helper_table, (gpointer)func);
 +    if (unlikely(g_once_init_enter(HELPER_INFO_INIT(info)))) {
 +        init_call_layout(info);
 +        g_once_init_leave(HELPER_INFO_INIT(info), HELPER_INFO_INIT_VAL(info));
 +    }
 +
      total_args = info->nr_out + info->nr_in + 2;
      op = tcg_op_alloc(INDEX_op_call, total_args);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
              g_assert_not_reached();
          }
      }
 -    op->args[pi++] = (uintptr_t)func;
 +    op->args[pi++] = (uintptr_t)info->func;
      op->args[pi++] = (uintptr_t)info;
      tcg_debug_assert(pi == total_args);
 diff --git a/include/exec/helper-info.c.inc b/include/exec/helper-info.c.inc
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/include/exec/helper-info.c.inc
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Helper file for declaring TCG helper functions.
 + * This one expands info structures for tcg helpers.
 + * Define HELPER_H for the header file to be expanded.
 + */
 +
 +#include "tcg/tcg.h"
 +#include "tcg/helper-info.h"
 +#include "exec/helper-head.h"
 +
 +/*
 + * Need one more level of indirection before stringification
 + * to get all the macros expanded first.
 + */
 +#define str(s) #s
 +
 +#define DEF_HELPER_FLAGS_0(NAME, FLAGS, RET)                            \
 +    TCGHelperInfo glue(helper_info_, NAME) = {                          \
 +        .func = HELPER(NAME), .name = str(NAME),                        \
 +        .flags = FLAGS | dh_callflag(RET),                              \
 +        .typemask = dh_typemask(RET, 0)                                 \
 +    };
 +
 +#define DEF_HELPER_FLAGS_1(NAME, FLAGS, RET, T1)                        \
 +    TCGHelperInfo glue(helper_info_, NAME) = {                          \
 +        .func = HELPER(NAME), .name = str(NAME),                        \
 +        .flags = FLAGS | dh_callflag(RET),                              \
 +        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
 +    };
 +
 +#define DEF_HELPER_FLAGS_2(NAME, FLAGS, RET, T1, T2)                    \
 +    TCGHelperInfo glue(helper_info_, NAME) = {                          \
 +        .func = HELPER(NAME), .name = str(NAME),                        \
 +        .flags = FLAGS | dh_callflag(RET),                              \
 +        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
 +                  | dh_typemask(T2, 2)                                  \
 +    };
 +
 +#define DEF_HELPER_FLAGS_3(NAME, FLAGS, RET, T1, T2, T3)                \
 +    TCGHelperInfo glue(helper_info_, NAME) = {                          \
 +        .func = HELPER(NAME), .name = str(NAME),                        \
 +        .flags = FLAGS | dh_callflag(RET),                              \
 +        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
 +                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
 +    };
 +
 +#define DEF_HELPER_FLAGS_4(NAME, FLAGS, RET, T1, T2, T3, T4)            \
 +    TCGHelperInfo glue(helper_info_, NAME) = {                          \
 +        .func = HELPER(NAME), .name = str(NAME),                        \
 +        .flags = FLAGS | dh_callflag(RET),                              \
 +        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
 +                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
 +                  | dh_typemask(T4, 4)                                  \
 +    };
 +
 +#define DEF_HELPER_FLAGS_5(NAME, FLAGS, RET, T1, T2, T3, T4, T5)        \
 +    TCGHelperInfo glue(helper_info_, NAME) = {                          \
 +        .func = HELPER(NAME), .name = str(NAME),                        \
 +        .flags = FLAGS | dh_callflag(RET),                              \
 +        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
 +                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
 +                  | dh_typemask(T4, 4) | dh_typemask(T5, 5)             \
 +    };
 +
 +#define DEF_HELPER_FLAGS_6(NAME, FLAGS, RET, T1, T2, T3, T4, T5, T6)    \
 +    TCGHelperInfo glue(helper_info_, NAME) = {                          \
 +        .func = HELPER(NAME), .name = str(NAME),                        \
 +        .flags = FLAGS | dh_callflag(RET),                              \
 +        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
 +                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
 +                  | dh_typemask(T4, 4) | dh_typemask(T5, 5)             \
 +                  | dh_typemask(T6, 6)                                  \
 +    };
 +
 +#define DEF_HELPER_FLAGS_7(NAME, FLAGS, RET, T1, T2, T3, T4, T5, T6, T7) \
 +    TCGHelperInfo glue(helper_info_, NAME) = {                          \
 +        .func = HELPER(NAME), .name = str(NAME),                        \
 +        .flags = FLAGS | dh_callflag(RET),                              \
 +        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
 +                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
 +                  | dh_typemask(T4, 4) | dh_typemask(T5, 5)             \
 +                  | dh_typemask(T6, 6) | dh_typemask(T7, 7)             \
 +    };
 +
 +#include HELPER_H
 +
 +#undef str
 +#undef DEF_HELPER_FLAGS_0
 +#undef DEF_HELPER_FLAGS_1
 +#undef DEF_HELPER_FLAGS_2
 +#undef DEF_HELPER_FLAGS_3
 +#undef DEF_HELPER_FLAGS_4
 +#undef DEF_HELPER_FLAGS_5
 +#undef DEF_HELPER_FLAGS_6
 +#undef DEF_HELPER_FLAGS_7
 --
-.34.1
+.43.0

The following changes since commit b52daaf2c868f2bab102eb5acbf55b2917f46aea:

Merge tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu into staging (2023-06-05 10:27:31 -0700)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230605

for you to fetch changes up to a7f6911c127b1dd1b8764e03b0ebcf0a227a15e4:

tcg/tcg-op-vec: Remove left over _link_error() definitions (2023-06-05 12:20:16 -0700)

----------------------------------------------------------------
Build tcg/ once for system and once for user.
Unmap perf_marker.
Remove left over _link_error() definitions.

----------------------------------------------------------------
Ilya Leoshkevich (1):
      accel/tcg: Unmap perf_marker

Philippe Mathieu-Daudé (2):
      target/ppc: Inline gen_icount_io_start()
      tcg/tcg-op-vec: Remove left over _link_error() definitions

Richard Henderson (49):
      tcg/ppc: Remove TARGET_LONG_BITS, TCG_TYPE_TL
      tcg/riscv: Remove TARGET_LONG_BITS, TCG_TYPE_TL
      tcg/s390x: Remove TARGET_LONG_BITS, TCG_TYPE_TL
      tcg/sparc64: Remove TARGET_LONG_BITS, TCG_TYPE_TL
      tcg: Move TCG_TYPE_TL from tcg.h to tcg-op.h
      tcg: Widen CPUTLBEntry comparators to 64-bits
      tcg: Add tlb_fast_offset to TCGContext
      target/avr: Add missing includes of qemu/error-report.h
      target/*: Add missing includes of tcg/debug-assert.h
      *: Add missing includes of tcg/tcg.h
      tcg: Split out tcg-target-reg-bits.h
      target/arm: Fix test of TCG_OVERSIZED_GUEST
      tcg: Split out tcg/oversized-guest.h
      tcg: Move TCGv, dup_const_tl definitions to tcg-op.h
      tcg: Split tcg/tcg-op-common.h from tcg/tcg-op.h
      target/arm: Include helper-gen.h in translator.h
      target/hexagon: Include helper-gen.h where needed
      tcg: Remove outdated comments in helper-head.h
      tcg: Move TCGHelperInfo and dependencies to tcg/helper-info.h
      tcg: Pass TCGHelperInfo to tcg_gen_callN
      tcg: Move temp_idx and tcgv_i32_temp debug out of line
      tcg: Split tcg_gen_callN
      tcg: Split helper-gen.h
      tcg: Split helper-proto.h
      target/sh4: Emit insn_start for each insn in gUSA region
      tcg: Add insn_start_words to TCGContext
      tcg: Add guest_mo to TCGContext
      tcg: Move TLB_FLAGS_MASK check out of get_alignment_bits
      tcg: Split tcg/tcg-op-gvec.h
      tcg: Remove NO_CPU_IO_DEFS
      exec-all: Widen tb_page_addr_t for user-only
      exec-all: Widen TranslationBlock pc and cs_base to 64-bits
      tcg: Spit out exec/translation-block.h
      include/exec: Remove CODE_GEN_AVG_BLOCK_SIZE
      accel/tcg: Move most of gen-icount.h into translator.c
      accel/tcg: Introduce translator_io_start
      accel/tcg: Move translator_fake_ldb out of line
      target/arm: Tidy helpers for translation
      target/mips: Tidy helpers for translation
      target/*: Add missing includes of exec/translation-block.h
      target/arm: Add missing include of exec/exec-all.h
      accel/tcg: Tidy includes for translator.[ch]
      tcg: Fix PAGE/PROT confusion
      tcg: Move env defines out of NEED_CPU_H in helper-head.h
      tcg: Remove target-specific headers from tcg.[ch]
      plugins: Move plugin_insn_append to translator.c
      plugins: Drop unused headers from exec/plugin-gen.h
      exec/poison: Do not poison CONFIG_SOFTMMU
      tcg: Build once for system and once for user-only

MAINTAINERS                                        |    3 +-
 include/exec/cpu-all.h                             |    3 +
 include/exec/cpu-defs.h                            |   50 +-
 include/exec/cpu_ldst.h                            |   22 +-
 include/exec/exec-all.h                            |  142 +--
 include/exec/gen-icount.h                          |   83 --
 include/exec/helper-gen-common.h                   |   18 +
 include/exec/helper-gen.h                          |   97 +-
 include/exec/helper-head.h                         |   24 +-
 include/exec/helper-proto-common.h                 |   18 +
 include/exec/helper-proto.h                        |   73 +-
 include/exec/helper-tcg.h                          |   75 --
 include/exec/plugin-gen.h                          |   24 -
 include/exec/poison.h                              |    1 -
 include/exec/tlb-common.h                          |   56 ++
 include/exec/translation-block.h                   |  149 +++
 include/exec/translator.h                          |   24 +-
 include/qemu/typedefs.h                            |    1 +
 include/tcg/helper-info.h                          |   64 ++
 include/tcg/insn-start-words.h                     |   17 +
 include/tcg/oversized-guest.h                      |   23 +
 include/tcg/tcg-op-common.h                        |  996 +++++++++++++++++++
 include/tcg/tcg-op-gvec-common.h                   |  426 ++++++++
 include/tcg/tcg-op-gvec.h                          |  444 +--------
 include/tcg/tcg-op.h                               | 1033 +-------------------
 include/tcg/tcg-opc.h                              |    6 +-
 include/tcg/tcg.h                                  |  107 +-
 target/arm/cpregs.h                                |    4 +-
 target/arm/tcg/translate.h                         |    5 +
 target/mips/tcg/translate.h                        |    5 +-
 target/ppc/cpu.h                                   |    2 -
 target/sparc/cpu.h                                 |    2 -
 tcg/aarch64/tcg-target-reg-bits.h                  |   12 +
 tcg/arm/tcg-target-reg-bits.h                      |   12 +
 tcg/i386/tcg-target-reg-bits.h                     |   16 +
 tcg/i386/tcg-target.h                              |    2 -
 tcg/loongarch64/tcg-target-reg-bits.h              |   21 +
 tcg/loongarch64/tcg-target.h                       |   11 -
 tcg/mips/tcg-target-reg-bits.h                     |   18 +
 tcg/mips/tcg-target.h                              |    8 -
 tcg/ppc/tcg-target-reg-bits.h                      |   16 +
 tcg/ppc/tcg-target.h                               |    5 -
 tcg/riscv/tcg-target-reg-bits.h                    |   19 +
 tcg/riscv/tcg-target.h                             |    9 -
 tcg/s390x/tcg-target-reg-bits.h                    |   17 +
 tcg/sparc64/tcg-target-reg-bits.h                  |   12 +
 tcg/tcg-internal.h                                 |   47 +-
 tcg/tci/tcg-target-reg-bits.h                      |   18 +
 tcg/tci/tcg-target.h                               |    8 -
 include/exec/helper-gen.h.inc                      |  102 ++
 include/exec/helper-proto.h.inc                    |   68 ++
 accel/tcg/cpu-exec.c                               |    2 +-
 accel/tcg/cputlb.c                                 |   12 +-
 accel/tcg/monitor.c                                |    1 +
 accel/tcg/perf.c                                   |   19 +-
 accel/tcg/plugin-gen.c                             |    6 +
 accel/tcg/tcg-accel-ops-mttcg.c                    |    2 +-
 accel/tcg/tcg-accel-ops-rr.c                       |    2 +-
 accel/tcg/tcg-all.c                                |    1 +
 accel/tcg/tcg-runtime-gvec.c                       |    2 +-
 accel/tcg/tcg-runtime.c                            |    6 +-
 accel/tcg/translate-all.c                          |   30 +-
 accel/tcg/translator.c                             |  140 ++-
 target/alpha/translate.c                           |   18 +-
 target/arm/ptw.c                                   |    8 +-
 target/arm/tcg/translate-a64.c                     |   42 +-
 target/arm/tcg/translate-m-nocp.c                  |    2 -
 target/arm/tcg/translate-mve.c                     |    4 -
 target/arm/tcg/translate-neon.c                    |    4 -
 target/arm/tcg/translate-sme.c                     |    7 -
 target/arm/tcg/translate-sve.c                     |   11 -
 target/arm/tcg/translate-vfp.c                     |    7 +-
 target/arm/tcg/translate.c                         |   41 +-
 target/avr/cpu.c                                   |    1 +
 target/avr/helper.c                                |    1 +
 target/avr/translate.c                             |    6 +-
 target/cris/translate.c                            |    8 +-
 target/hexagon/genptr.c                            |    1 +
 target/hexagon/translate.c                         |    7 +
 target/hppa/translate.c                            |   10 +-
 target/i386/helper.c                               |    3 +
 target/i386/tcg/translate.c                        |   57 +-
 target/loongarch/translate.c                       |    7 +-
 target/m68k/translate.c                            |    5 +-
 target/microblaze/translate.c                      |    6 +-
 target/mips/tcg/msa_translate.c                    |    3 -
 target/mips/tcg/mxu_translate.c                    |    2 -
 target/mips/tcg/octeon_translate.c                 |    4 +-
 target/mips/tcg/rel6_translate.c                   |    2 -
 target/mips/tcg/translate.c                        |   53 +-
 target/mips/tcg/translate_addr_const.c             |    1 -
 target/mips/tcg/tx79_translate.c                   |    4 +-
 target/mips/tcg/vr54xx_translate.c                 |    3 -
 target/nios2/translate.c                           |    6 +-
 target/openrisc/sys_helper.c                       |    1 +
 target/openrisc/translate.c                        |   14 +-
 target/ppc/translate.c                             |   78 +-
 target/riscv/cpu_helper.c                          |    1 +
 target/riscv/translate.c                           |    6 +-
 target/rx/cpu.c                                    |    1 +
 target/rx/op_helper.c                              |    1 +
 target/rx/translate.c                              |    7 +-
 target/s390x/tcg/translate.c                       |   10 +-
 target/sh4/translate.c                             |   21 +-
 target/sparc/translate.c                           |   78 +-
 target/tricore/cpu.c                               |    1 +
 target/tricore/translate.c                         |    7 +-
 target/xtensa/translate.c                          |   31 +-
 tcg/optimize.c                                     |    2 +-
 tcg/region.c                                       |   20 +-
 tcg/tcg-op-gvec.c                                  |    4 +-
 tcg/tcg-op-ldst.c                                  |   26 +-
 tcg/tcg-op-vec.c                                   |   13 +-
 tcg/tcg-op.c                                       |    4 +-
 tcg/tcg.c                                          |  218 +++--
 tcg/tci.c                                          |    3 +-
 include/exec/helper-info.c.inc                     |   96 ++
 target/loongarch/insn_trans/trans_extra.c.inc      |    4 +-
 target/loongarch/insn_trans/trans_privileged.c.inc |    4 +-
 target/ppc/power8-pmu-regs.c.inc                   |   10 +-
 target/ppc/translate/branch-impl.c.inc             |    2 +-
 target/riscv/insn_trans/trans_privileged.c.inc     |    8 +-
 target/riscv/insn_trans/trans_rvi.c.inc            |   24 +-
 tcg/aarch64/tcg-target.c.inc                       |    8 +-
 tcg/arm/tcg-target.c.inc                           |    8 +-
 tcg/i386/tcg-target.c.inc                          |    9 +-
 tcg/loongarch64/tcg-target.c.inc                   |    8 +-
 tcg/mips/tcg-target.c.inc                          |   20 +-
 tcg/ppc/tcg-target.c.inc                           |   46 +-
 tcg/riscv/tcg-target.c.inc                         |   21 +-
 tcg/s390x/tcg-target.c.inc                         |   22 +-
 tcg/sparc64/tcg-target.c.inc                       |   20 +-
 scripts/make-config-poison.sh                      |    5 +-
 target/hexagon/idef-parser/idef-parser.y           |    3 +-
 tcg/meson.build                                    |   30 +-
 135 files changed, 3088 insertions(+), 2782 deletions(-)
 delete mode 100644 include/exec/gen-icount.h
 create mode 100644 include/exec/helper-gen-common.h
 create mode 100644 include/exec/helper-proto-common.h
 delete mode 100644 include/exec/helper-tcg.h
 create mode 100644 include/exec/tlb-common.h
 create mode 100644 include/exec/translation-block.h
 create mode 100644 include/tcg/helper-info.h
 create mode 100644 include/tcg/insn-start-words.h
 create mode 100644 include/tcg/oversized-guest.h
 create mode 100644 include/tcg/tcg-op-common.h
 create mode 100644 include/tcg/tcg-op-gvec-common.h
 create mode 100644 tcg/aarch64/tcg-target-reg-bits.h
 create mode 100644 tcg/arm/tcg-target-reg-bits.h
 create mode 100644 tcg/i386/tcg-target-reg-bits.h
 create mode 100644 tcg/loongarch64/tcg-target-reg-bits.h
 create mode 100644 tcg/mips/tcg-target-reg-bits.h
 create mode 100644 tcg/ppc/tcg-target-reg-bits.h
 create mode 100644 tcg/riscv/tcg-target-reg-bits.h
 create mode 100644 tcg/s390x/tcg-target-reg-bits.h
 create mode 100644 tcg/sparc64/tcg-target-reg-bits.h
 create mode 100644 tcg/tci/tcg-target-reg-bits.h
 create mode 100644 include/exec/helper-gen.h.inc
 create mode 100644 include/exec/helper-proto.h.inc
 create mode 100644 include/exec/helper-info.c.inc

All uses replaced with TCGContext.addr_type.

Reviewed-by: Anton Johansson <anjo@rev.ng>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                            TCGReg addrlo, TCGReg addrhi,
                                            MemOpIdx oi, bool is_ld)
 {
+    TCGType addr_type = s->addr_type;
     TCGLabelQemuLdst *ldst = NULL;
     MemOp opc = get_memop(oi);
     MemOp a_bits, s_bits;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     tcg_out32(s, AND | SAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_R0));
 
     /* Load the (low part) TLB comparator into TMP2.  */
-    if (cmp_off == 0 && TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
-        uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32
+    if (cmp_off == 0
+        && (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32)) {
+        uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32
                         ? LWZUX : LDUX);
         tcg_out32(s, lxu | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
     } else {
         tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2));
-        if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
+        if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) {
             tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2,
                        TCG_REG_TMP1, cmp_off + 4 * HOST_BIG_ENDIAN);
         } else {
-            tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
+            tcg_out_ld(s, addr_type, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
         }
     }
 
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      * Load the TLB addend for use on the fast path.
      * Do this asap to minimize any load use delay.
      */
-    if (TCG_TARGET_REG_BITS >= TARGET_LONG_BITS) {
+    if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) {
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1,
                    offsetof(CPUTLBEntry, addend));
     }
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
         }
 
         /* Mask the address for the requested alignment.  */
-        if (TARGET_LONG_BITS == 32) {
+        if (addr_type == TCG_TYPE_I32) {
             tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0,
                         (32 - a_bits) & 31, 31 - s->page_bits);
         } else if (a_bits == 0) {
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
         }
     }
 
-    if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
+    if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) {
         /* Low part comparison into cr7. */
         tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2,
                     0, 7, TCG_TYPE_I32);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
         tcg_out32(s, CRAND | BT(7, CR_EQ) | BA(6, CR_EQ) | BB(7, CR_EQ));
     } else {
         /* Full comparison into cr7. */
-        tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2,
-                    0, 7, TCG_TYPE_TL);
+        tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2, 0, 7, addr_type);
     }
 
     /* Load a pointer into the current opcode w/conditional branch-link. */
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     h->base = guest_base ? TCG_GUEST_BASE_REG : 0;
 #endif
 
-    if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
+    if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) {
         /* Zero-extend the guest address for use in the host address. */
         tcg_out_ext32u(s, TCG_REG_R0, addrlo);
         h->index = TCG_REG_R0;
-- 
2.34.1

All uses replaced with TCGContext.addr_type.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/riscv/tcg-target.c.inc | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
                                            TCGReg addr_reg, MemOpIdx oi,
                                            bool is_ld)
 {
+    TCGType addr_type = s->addr_type;
     TCGLabelQemuLdst *ldst = NULL;
     MemOp opc = get_memop(oi);
     TCGAtomAlign aa;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
     addr_adj = addr_reg;
     if (a_mask < s_mask) {
         addr_adj = TCG_REG_TMP0;
-        tcg_out_opc_imm(s, TARGET_LONG_BITS == 32 ? OPC_ADDIW : OPC_ADDI,
+        tcg_out_opc_imm(s, addr_type == TCG_TYPE_I32 ? OPC_ADDIW : OPC_ADDI,
                         addr_adj, addr_reg, s_mask - a_mask);
     }
     compare_mask = s->page_mask | a_mask;
     if (compare_mask == sextreg(compare_mask, 0, 12)) {
         tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_adj, compare_mask);
     } else {
-        tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_TMP1, compare_mask);
+        tcg_out_movi(s, addr_type, TCG_REG_TMP1, compare_mask);
         tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr_adj);
     }
 
     /* Load the tlb comparator and the addend.  */
-    tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_TMP0, TCG_REG_TMP2,
+    tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2,
                is_ld ? offsetof(CPUTLBEntry, addr_read)
                      : offsetof(CPUTLBEntry, addr_write));
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2,
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
     tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0);
 
     /* TLB Hit - translate address using addend.  */
-    if (TARGET_LONG_BITS == 64) {
+    if (addr_type != TCG_TYPE_I32) {
         tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, addr_reg, TCG_REG_TMP2);
     } else if (have_zba) {
         tcg_out_opc_reg(s, OPC_ADD_UW, TCG_REG_TMP0, addr_reg, TCG_REG_TMP2);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
 
     if (guest_base != 0) {
         base = TCG_REG_TMP0;
-        if (TARGET_LONG_BITS == 64) {
+        if (addr_type != TCG_TYPE_I32) {
             tcg_out_opc_reg(s, OPC_ADD, base, addr_reg, TCG_GUEST_BASE_REG);
         } else if (have_zba) {
             tcg_out_opc_reg(s, OPC_ADD_UW, base, addr_reg, TCG_GUEST_BASE_REG);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
             tcg_out_ext32u(s, base, addr_reg);
             tcg_out_opc_reg(s, OPC_ADD, base, base, TCG_GUEST_BASE_REG);
         }
-    } else if (TARGET_LONG_BITS == 64) {
+    } else if (addr_type != TCG_TYPE_I32) {
         base = addr_reg;
     } else {
         base = TCG_REG_TMP0;
-- 
2.34.1

All uses replaced with TCGContext.addr_type.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.c.inc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                            TCGReg addr_reg, MemOpIdx oi,
                                            bool is_ld)
 {
+    TCGType addr_type = s->addr_type;
     TCGLabelQemuLdst *ldst = NULL;
     MemOp opc = get_memop(oi);
     MemOp s_bits = opc & MO_SIZE;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
         tgen_andi_risbg(s, TCG_REG_R0, addr_reg, tlb_mask);
     } else {
         tcg_out_insn(s, RX, LA, TCG_REG_R0, addr_reg, TCG_REG_NONE, a_off);
-        tgen_andi(s, TCG_TYPE_TL, TCG_REG_R0, tlb_mask);
+        tgen_andi(s, addr_type, TCG_REG_R0, tlb_mask);
     }
 
     if (is_ld) {
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     } else {
         ofs = offsetof(CPUTLBEntry, addr_write);
     }
-    if (TARGET_LONG_BITS == 32) {
+    if (addr_type == TCG_TYPE_I32) {
         tcg_out_insn(s, RX, C, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
     } else {
         tcg_out_insn(s, RXY, CG, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     tcg_out_insn(s, RXY, LG, h->index, TCG_TMP0, TCG_REG_NONE,
                  offsetof(CPUTLBEntry, addend));
 
-    if (TARGET_LONG_BITS == 32) {
+    if (addr_type == TCG_TYPE_I32) {
         tcg_out_insn(s, RRE, ALGFR, h->index, addr_reg);
         h->base = TCG_REG_NONE;
     } else {
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     }
 
     h->base = addr_reg;
-    if (TARGET_LONG_BITS == 32) {
+    if (addr_type == TCG_TYPE_I32) {
         tcg_out_ext32u(s, TCG_TMP0, addr_reg);
         h->base = TCG_TMP0;
     }
-- 
2.34.1

All uses replaced with TCGContext.addr_type.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc64/tcg-target.c.inc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                            TCGReg addr_reg, MemOpIdx oi,
                                            bool is_ld)
 {
+    TCGType addr_type = s->addr_type;
     TCGLabelQemuLdst *ldst = NULL;
     MemOp opc = get_memop(oi);
     MemOp s_bits = opc & MO_SIZE;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T3, ARITH_ADD);
 
     /* Load the tlb comparator and the addend. */
-    tcg_out_ld(s, TCG_TYPE_TL, TCG_REG_T2, TCG_REG_T1, cmp_off);
+    tcg_out_ld(s, addr_type, TCG_REG_T2, TCG_REG_T1, cmp_off);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T1, TCG_REG_T1, add_off);
     h->base = TCG_REG_T1;
 
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     ldst->label_ptr[0] = s->code_ptr;
 
     /* bne,pn %[xi]cc, label0 */
-    cc = TARGET_LONG_BITS == 64 ? BPCC_XCC : BPCC_ICC;
+    cc = addr_type == TCG_TYPE_I32 ? BPCC_ICC : BPCC_XCC;
     tcg_out_bpcc0(s, COND_NE, BPCC_PN | cc, 0);
 #else
     /*
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
 #endif
 
     /* If the guest address must be zero-extended, do in the delay slot.  */
-    if (TARGET_LONG_BITS == 32) {
+    if (addr_type == TCG_TYPE_I32) {
         tcg_out_ext32u(s, TCG_REG_T2, addr_reg);
         h->index = TCG_REG_T2;
     } else {
-- 
2.34.1

Removes the only use of TARGET_LONG_BITS from tcg.h, which is to be
target independent.  Move the symbol to a define in tcg-op.h, which
will continue to be target dependent.  Rather than complicate matters
for the use in tb_gen_code(), expand the definition there.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op.h      | 8 ++++++++
 include/tcg/tcg.h         | 7 -------
 accel/tcg/translate-all.c | 2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
 #error must include QEMU headers
 #endif
 
+#if TARGET_LONG_BITS == 32
+# define TCG_TYPE_TL  TCG_TYPE_I32
+#elif TARGET_LONG_BITS == 64
+# define TCG_TYPE_TL  TCG_TYPE_I64
+#else
+# error
+#endif
+
 #if TARGET_INSN_START_WORDS == 1
 static inline void tcg_gen_insn_start(target_ulong pc)
 {
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef enum TCGType {
 #else
     TCG_TYPE_PTR = TCG_TYPE_I64,
 #endif
-
-    /* An alias for the size of the target "long", aka register.  */
-#if TARGET_LONG_BITS == 64
-    TCG_TYPE_TL = TCG_TYPE_I64,
-#else
-    TCG_TYPE_TL = TCG_TYPE_I32,
-#endif
 } TCGType;
 
 /**
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     tb_set_page_addr0(tb, phys_pc);
     tb_set_page_addr1(tb, -1);
     tcg_ctx->gen_tb = tb;
-    tcg_ctx->addr_type = TCG_TYPE_TL;
+    tcg_ctx->addr_type = TARGET_LONG_BITS == 32 ? TCG_TYPE_I32 : TCG_TYPE_I64;
 #ifdef CONFIG_SOFTMMU
     tcg_ctx->page_bits = TARGET_PAGE_BITS;
     tcg_ctx->page_mask = TARGET_PAGE_MASK;
-- 
2.34.1

This makes CPUTLBEntry agnostic to the address size of the guest.
When 32-bit addresses are in effect, we can simply read the low
32 bits of the 64-bit field.  Similarly, when we need to update the
field to set TLB_NOTDIRTY, we can modify only the low 32 bits.

For TCG backends that could in theory be big-endian, but in
practice are not (arm, loongarch, riscv), use QEMU_BUILD_BUG_ON
to document and ensure this is not accidentally missed.

For s390x, which is always big-endian, use HOST_BIG_ENDIAN anyway,
to document the reason for the adjustment.

For sparc64 and ppc64, always perform a 64-bit load, and rely on
the following 32-bit comparison to ignore the high bits.

Rearrange mips and ppc if ladders for clarity.

Reviewed-by: Anton Johansson <anjo@rev.ng>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-defs.h          | 37 +++++++++++---------------------
 include/exec/cpu_ldst.h          | 19 ++++++++++------
 accel/tcg/cputlb.c               |  8 +++++--
 tcg/aarch64/tcg-target.c.inc     |  1 +
 tcg/arm/tcg-target.c.inc         |  1 +
 tcg/loongarch64/tcg-target.c.inc |  1 +
 tcg/mips/tcg-target.c.inc        | 13 ++++++-----
 tcg/ppc/tcg-target.c.inc         | 28 +++++++++++++-----------
 tcg/riscv/tcg-target.c.inc       |  1 +
 tcg/s390x/tcg-target.c.inc       |  1 +
 tcg/sparc64/tcg-target.c.inc     |  8 +++++--
 11 files changed, 67 insertions(+), 51 deletions(-)

diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -XXX,XX +XXX,XX @@
 /* use a fully associative victim tlb of 8 entries */
 #define CPU_VTLB_SIZE 8
 
-#if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32
-#define CPU_TLB_ENTRY_BITS 4
-#else
 #define CPU_TLB_ENTRY_BITS 5
-#endif
 
 #define CPU_TLB_DYN_MIN_BITS 6
 #define CPU_TLB_DYN_DEFAULT_BITS 8
@@ -XXX,XX +XXX,XX @@
 # endif
 
 /* Minimalized TLB entry for use by TCG fast path. */
-typedef struct CPUTLBEntry {
-    /* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
-       bit TARGET_PAGE_BITS-1..4  : Nonzero for accesses that should not
-                                    go directly to ram.
-       bit 3                      : indicates that the entry is invalid
-       bit 2..0                   : zero
-    */
-    union {
-        struct {
-            target_ulong addr_read;
-            target_ulong addr_write;
-            target_ulong addr_code;
-            /* Addend to virtual address to get host address.  IO accesses
-               use the corresponding iotlb value.  */
-            uintptr_t addend;
-        };
+typedef union CPUTLBEntry {
+    struct {
+        uint64_t addr_read;
+        uint64_t addr_write;
+        uint64_t addr_code;
         /*
-         * Padding to get a power of two size, as well as index
-         * access to addr_{read,write,code}.
+         * Addend to virtual address to get host address.  IO accesses
+         * use the corresponding iotlb value.
          */
-        target_ulong addr_idx[(1 << CPU_TLB_ENTRY_BITS) / TARGET_LONG_SIZE];
+        uintptr_t addend;
     };
+    /*
+     * Padding to get a power of two size, as well as index
+     * access to addr_{read,write,code}.
+     */
+    uint64_t addr_idx[(1 << CPU_TLB_ENTRY_BITS) / sizeof(uint64_t)];
 } CPUTLBEntry;
 
 QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
 
-
 #endif  /* !CONFIG_USER_ONLY && CONFIG_TCG */
 
 #if !defined(CONFIG_USER_ONLY)
diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline target_ulong tlb_read_idx(const CPUTLBEntry *entry,
 {
     /* Do not rearrange the CPUTLBEntry structure members. */
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_read) !=
-                      MMU_DATA_LOAD * TARGET_LONG_SIZE);
+                      MMU_DATA_LOAD * sizeof(uint64_t));
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_write) !=
-                      MMU_DATA_STORE * TARGET_LONG_SIZE);
+                      MMU_DATA_STORE * sizeof(uint64_t));
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_code) !=
-                      MMU_INST_FETCH * TARGET_LONG_SIZE);
+                      MMU_INST_FETCH * sizeof(uint64_t));
 
-    const target_ulong *ptr = &entry->addr_idx[access_type];
-#if TCG_OVERSIZED_GUEST
-    return *ptr;
+#if TARGET_LONG_BITS == 32
+    /* Use qatomic_read, in case of addr_write; only care about low bits. */
+    const uint32_t *ptr = (uint32_t *)&entry->addr_idx[access_type];
+    ptr += HOST_BIG_ENDIAN;
+    return qatomic_read(ptr);
 #else
+    const uint64_t *ptr = &entry->addr_idx[access_type];
+# if TCG_OVERSIZED_GUEST
+    return *ptr;
+# else
     /* ofs might correspond to .addr_write, so use qatomic_read */
     return qatomic_read(ptr);
+# endif
 #endif
 }
 
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
         addr &= TARGET_PAGE_MASK;
         addr += tlb_entry->addend;
         if ((addr - start) < length) {
-#if TCG_OVERSIZED_GUEST
+#if TARGET_LONG_BITS == 32
+            uint32_t *ptr_write = (uint32_t *)&tlb_entry->addr_write;
+            ptr_write += HOST_BIG_ENDIAN;
+            qatomic_set(ptr_write, *ptr_write | TLB_NOTDIRTY);
+#elif TCG_OVERSIZED_GUEST
             tlb_entry->addr_write |= TLB_NOTDIRTY;
 #else
             qatomic_set(&tlb_entry->addr_write,
-                       tlb_entry->addr_write | TLB_NOTDIRTY);
+                        tlb_entry->addr_write | TLB_NOTDIRTY);
 #endif
         }
     }
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0);
 
     /* Load the tlb comparator into TMP0, and the fast path addend into TMP1. */
+    QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
     tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1,
                is_ld ? offsetof(CPUTLBEntry, addr_read)
                      : offsetof(CPUTLBEntry, addr_write));
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
      * Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
      * Load the tlb comparator into R2/R3 and the fast path addend into R1.
      */
+    QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
     if (cmp_off == 0) {
         if (s->addr_type == TCG_TYPE_I32) {
             tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     tcg_out_opc_add_d(s, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1);
 
     /* Load the tlb comparator and the addend.  */
+    QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
     tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2,
                is_ld ? offsetof(CPUTLBEntry, addr_read)
                      : offsetof(CPUTLBEntry, addr_write));
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     /* Add the tlb_table pointer, creating the CPUTLBEntry address in TMP3.  */
     tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1);
 
+    if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) {
+        /* Load the (low half) tlb comparator.  */
+        tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3,
+                   cmp_off + HOST_BIG_ENDIAN * 4);
+    } else {
+        tcg_out_ld(s, TCG_TYPE_I64, TCG_TMP0, TCG_TMP3, cmp_off);
+    }
+
     if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) {
-        /* Load the tlb comparator.  */
-        tcg_out_ld(s, addr_type, TCG_TMP0, TCG_TMP3, cmp_off);
         /* Load the tlb addend for the fast path.  */
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off);
-    } else {
-        /* Load the low half of the tlb comparator.  */
-        tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + LO_OFF);
     }
 
     /*
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     }
     tcg_out32(s, AND | SAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_R0));
 
-    /* Load the (low part) TLB comparator into TMP2.  */
-    if (cmp_off == 0
-        && (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32)) {
-        uint32_t lxu = (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32
-                        ? LWZUX : LDUX);
-        tcg_out32(s, lxu | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
+    /*
+     * Load the (low part) TLB comparator into TMP2.
+     * For 64-bit host, always load the entire 64-bit slot for simplicity.
+     * We will ignore the high bits with tcg_out_cmp(..., addr_type).
+     */
+    if (TCG_TARGET_REG_BITS == 64) {
+        if (cmp_off == 0) {
+            tcg_out32(s, LDUX | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
+        } else {
+            tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2));
+            tcg_out_ld(s, TCG_TYPE_I64, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
+        }
+    } else if (cmp_off == 0 && !HOST_BIG_ENDIAN) {
+        tcg_out32(s, LWZUX | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2));
     } else {
         tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2));
-        if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) {
-            tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2,
-                       TCG_REG_TMP1, cmp_off + 4 * HOST_BIG_ENDIAN);
-        } else {
-            tcg_out_ld(s, addr_type, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off);
-        }
+        tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2, TCG_REG_TMP1,
+                   cmp_off + 4 * HOST_BIG_ENDIAN);
     }
 
     /*
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
     }
 
     /* Load the tlb comparator and the addend.  */
+    QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
     tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2,
                is_ld ? offsetof(CPUTLBEntry, addr_read)
                      : offsetof(CPUTLBEntry, addr_write));
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
         ofs = offsetof(CPUTLBEntry, addr_write);
     }
     if (addr_type == TCG_TYPE_I32) {
+        ofs += HOST_BIG_ENDIAN * 4;
         tcg_out_insn(s, RX, C, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
     } else {
         tcg_out_insn(s, RXY, CG, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs);
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     /* Add the tlb_table pointer, creating the CPUTLBEntry address into R2.  */
     tcg_out_arith(s, TCG_REG_T1, TCG_REG_T1, TCG_REG_T3, ARITH_ADD);
 
-    /* Load the tlb comparator and the addend. */
-    tcg_out_ld(s, addr_type, TCG_REG_T2, TCG_REG_T1, cmp_off);
+    /*
+     * Load the tlb comparator and the addend.
+     * Always load the entire 64-bit comparator for simplicity.
+     * We will ignore the high bits via BPCC_ICC below.
+     */
+    tcg_out_ld(s, TCG_TYPE_I64, TCG_REG_T2, TCG_REG_T1, cmp_off);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T1, TCG_REG_T1, add_off);
     h->base = TCG_REG_T1;
 
-- 
2.34.1

Disconnect the layout of ArchCPU from TCG compilation.
Pass the relative offset of 'env' and 'neg.tlb.f' as a parameter.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-defs.h          | 39 +---------------------
 include/exec/tlb-common.h        | 56 ++++++++++++++++++++++++++++++++
 include/tcg/tcg.h                |  1 +
 accel/tcg/translate-all.c        |  2 ++
 tcg/tcg.c                        | 13 ++++++++
 tcg/aarch64/tcg-target.c.inc     |  7 ++--
 tcg/arm/tcg-target.c.inc         |  7 ++--
 tcg/i386/tcg-target.c.inc        |  9 ++---
 tcg/loongarch64/tcg-target.c.inc |  7 ++--
 tcg/mips/tcg-target.c.inc        |  7 ++--
 tcg/ppc/tcg-target.c.inc         |  7 ++--
 tcg/riscv/tcg-target.c.inc       |  7 ++--
 tcg/s390x/tcg-target.c.inc       |  7 ++--
 tcg/sparc64/tcg-target.c.inc     |  7 ++--
 14 files changed, 110 insertions(+), 66 deletions(-)
 create mode 100644 include/exec/tlb-common.h

diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -XXX,XX +XXX,XX @@
 #define NB_MMU_MODES 16
 
 #if !defined(CONFIG_USER_ONLY) && defined(CONFIG_TCG)
+#include "exec/tlb-common.h"
 
 /* use a fully associative victim tlb of 8 entries */
 #define CPU_VTLB_SIZE 8
 
-#define CPU_TLB_ENTRY_BITS 5
-
 #define CPU_TLB_DYN_MIN_BITS 6
 #define CPU_TLB_DYN_DEFAULT_BITS 8
 
@@ -XXX,XX +XXX,XX @@
 #  endif
 # endif
 
-/* Minimalized TLB entry for use by TCG fast path. */
-typedef union CPUTLBEntry {
-    struct {
-        uint64_t addr_read;
-        uint64_t addr_write;
-        uint64_t addr_code;
-        /*
-         * Addend to virtual address to get host address.  IO accesses
-         * use the corresponding iotlb value.
-         */
-        uintptr_t addend;
-    };
-    /*
-     * Padding to get a power of two size, as well as index
-     * access to addr_{read,write,code}.
-     */
-    uint64_t addr_idx[(1 << CPU_TLB_ENTRY_BITS) / sizeof(uint64_t)];
-} CPUTLBEntry;
-
-QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
-
 #endif  /* !CONFIG_USER_ONLY && CONFIG_TCG */
 
 #if !defined(CONFIG_USER_ONLY)
@@ -XXX,XX +XXX,XX @@ typedef struct CPUTLBDesc {
     CPUTLBEntryFull *fulltlb;
 } CPUTLBDesc;
 
-/*
- * Data elements that are per MMU mode, accessed by the fast path.
- * The structure is aligned to aid loading the pair with one insn.
- */
-typedef struct CPUTLBDescFast {
-    /* Contains (n_entries - 1) << CPU_TLB_ENTRY_BITS */
-    uintptr_t mask;
-    /* The array of tlb entries itself. */
-    CPUTLBEntry *table;
-} CPUTLBDescFast QEMU_ALIGNED(2 * sizeof(void *));
-
 /*
  * Data elements that are shared between all MMU modes.
  */
@@ -XXX,XX +XXX,XX @@ typedef struct CPUTLB {
     CPUTLBDescFast f[NB_MMU_MODES];
 } CPUTLB;
 
-/* This will be used by TCG backends to compute offsets.  */
-#define TLB_MASK_TABLE_OFS(IDX) \
-    ((int)offsetof(ArchCPU, neg.tlb.f[IDX]) - (int)offsetof(ArchCPU, env))
-
 #else
 
 typedef struct CPUTLB { } CPUTLB;
diff --git a/include/exec/tlb-common.h b/include/exec/tlb-common.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/exec/tlb-common.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Common definitions for the softmmu tlb
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef EXEC_TLB_COMMON_H
+#define EXEC_TLB_COMMON_H 1
+
+#define CPU_TLB_ENTRY_BITS 5
+
+/* Minimalized TLB entry for use by TCG fast path. */
+typedef union CPUTLBEntry {
+    struct {
+        uint64_t addr_read;
+        uint64_t addr_write;
+        uint64_t addr_code;
+        /*
+         * Addend to virtual address to get host address.  IO accesses
+         * use the corresponding iotlb value.
+         */
+        uintptr_t addend;
+    };
+    /*
+     * Padding to get a power of two size, as well as index
+     * access to addr_{read,write,code}.
+     */
+    uint64_t addr_idx[(1 << CPU_TLB_ENTRY_BITS) / sizeof(uint64_t)];
+} CPUTLBEntry;
+
+QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
+
+/*
+ * Data elements that are per MMU mode, accessed by the fast path.
+ * The structure is aligned to aid loading the pair with one insn.
+ */
+typedef struct CPUTLBDescFast {
+    /* Contains (n_entries - 1) << CPU_TLB_ENTRY_BITS */
+    uintptr_t mask;
+    /* The array of tlb entries itself. */
+    CPUTLBEntry *table;
+} CPUTLBDescFast QEMU_ALIGNED(2 * sizeof(void *));
+
+#endif /* EXEC_TLB_COMMON_H */
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
     TCGType addr_type;            /* TCG_TYPE_I32 or TCG_TYPE_I64 */
 
 #ifdef CONFIG_SOFTMMU
+    int tlb_fast_offset;
     int page_mask;
     uint8_t page_bits;
     uint8_t tlb_dyn_max_bits;
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     tcg_ctx->page_bits = TARGET_PAGE_BITS;
     tcg_ctx->page_mask = TARGET_PAGE_MASK;
     tcg_ctx->tlb_dyn_max_bits = CPU_TLB_DYN_MAX_BITS;
+    tcg_ctx->tlb_fast_offset =
+        (int)offsetof(ArchCPU, neg.tlb.f) - (int)offsetof(ArchCPU, env);
 #endif
 
  tb_overflow:
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
 #define NO_CPU_IO_DEFS
 
 #include "exec/exec-all.h"
+#include "exec/tlb-common.h"
 #include "tcg/tcg-op.h"
 
 #if UINTPTR_MAX == UINT32_MAX
@@ -XXX,XX +XXX,XX @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which)
     return (uintptr_t)tcg_splitwx_to_rx(&s->gen_tb->jmp_target_addr[which]);
 }
 
+#if defined(CONFIG_SOFTMMU) && !defined(CONFIG_TCG_INTERPRETER)
+static int tlb_mask_table_ofs(TCGContext *s, int which)
+{
+    return s->tlb_fast_offset + which * sizeof(CPUTLBDescFast);
+}
+#endif
+
 /* Signal overflow, starting over with fewer guest insns. */
 static G_NORETURN
 void tcg_raise_tb_overflow(TCGContext *s)
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
 
     tcg_debug_assert(s->addr_type == TCG_TYPE_I32 ||
                      s->addr_type == TCG_TYPE_I64);
+
+#if defined(CONFIG_SOFTMMU) && !defined(CONFIG_TCG_INTERPRETER)
+    tcg_debug_assert(s->tlb_fast_offset < 0);
+    tcg_debug_assert(s->tlb_fast_offset >= MIN_TLB_MASK_TABLE_OFS);
+#endif
 }
 
 static TCGTemp *tcg_temp_alloc(TCGContext *s)
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     return true;
 }
 
+/* We expect to use a 7-bit scaled negative offset from ENV.  */
+#define MIN_TLB_MASK_TABLE_OFS  -512
+
 /*
  * For softmmu, perform the TLB load and compare.
  * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                  ? TCG_TYPE_I64 : TCG_TYPE_I32);
 
     /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {tmp0,tmp1}. */
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -512);
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
     tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0,
-                 TLB_MASK_TABLE_OFS(mem_index), 1, 0);
+                 tlb_mask_table_ofs(s, mem_index), 1, 0);
 
     /* Extract the TLB index from the address into X0.  */
     tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     return true;
 }
 
+/* We expect to use a 9-bit sign-magnitude negative offset from ENV.  */
+#define MIN_TLB_MASK_TABLE_OFS  -256
+
 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                            TCGReg addrlo, TCGReg addrhi,
                                            MemOpIdx oi, bool is_ld)
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     int mem_index = get_mmuidx(oi);
     int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read)
                         : offsetof(CPUTLBEntry, addr_write);
-    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
+    int fast_off = tlb_mask_table_ofs(s, mem_index);
     unsigned s_mask = (1 << (opc & MO_SIZE)) - 1;
     TCGReg t_addr;
 
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     ldst->addrhi_reg = addrhi;
 
     /* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}.  */
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
     tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline int setup_guest_base_seg(void)
 #endif /* setup_guest_base_seg */
 #endif /* !SOFTMMU */
 
+#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
+
 /*
  * For softmmu, perform the TLB load and compare.
  * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     int trexw = 0, hrexw = 0, tlbrexw = 0;
     unsigned mem_index = get_mmuidx(oi);
     unsigned s_mask = (1 << s_bits) - 1;
+    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
     int tlb_mask;
 
     ldst = new_ldst_label(s);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                    s->page_bits - CPU_TLB_ENTRY_BITS);
 
     tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
-                         TLB_MASK_TABLE_OFS(mem_index) +
-                         offsetof(CPUTLBDescFast, mask));
+                         fast_ofs + offsetof(CPUTLBDescFast, mask));
 
     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
-                         TLB_MASK_TABLE_OFS(mem_index) +
-                         offsetof(CPUTLBDescFast, table));
+                         fast_ofs + offsetof(CPUTLBDescFast, table));
 
     /*
      * If the required alignment is at least as large as the access, simply
diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool tcg_target_has_memory_bswap(MemOp memop)
     return false;
 }
 
+/* We expect to use a 12-bit negative offset from ENV.  */
+#define MIN_TLB_MASK_TABLE_OFS  -(1 << 11)
+
 /*
  * For softmmu, perform the TLB load and compare.
  * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
 #ifdef CONFIG_SOFTMMU
     unsigned s_bits = opc & MO_SIZE;
     int mem_index = get_mmuidx(oi);
-    int fast_ofs = TLB_MASK_TABLE_OFS(mem_index);
+    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
     int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask);
     int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table);
 
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     ldst->oi = oi;
     ldst->addrlo_reg = addr_reg;
 
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 11));
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs);
 
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool tcg_target_has_memory_bswap(MemOp memop)
     return false;
 }
 
+/* We expect to use a 16-bit negative offset from ENV.  */
+#define MIN_TLB_MASK_TABLE_OFS  -32768
+
 /*
  * For softmmu, perform the TLB load and compare.
  * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
 #ifdef CONFIG_SOFTMMU
     unsigned s_mask = (1 << s_bits) - 1;
     int mem_index = get_mmuidx(oi);
-    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
+    int fast_off = tlb_mask_table_ofs(s, mem_index);
     int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
     int table_off = fast_off + offsetof(CPUTLBDescFast, table);
     int add_off = offsetof(CPUTLBEntry, addend);
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     ldst->addrhi_reg = addrhi;
 
     /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -32768);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_AREG0, mask_off);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP1, TCG_AREG0, table_off);
 
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool tcg_target_has_memory_bswap(MemOp memop)
     return aa.atom <= MO_64;
 }
 
+/* We expect to use a 16-bit negative offset from ENV.  */
+#define MIN_TLB_MASK_TABLE_OFS  -32768
+
 /*
  * For softmmu, perform the TLB load and compare.
  * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     int mem_index = get_mmuidx(oi);
     int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read)
                         : offsetof(CPUTLBEntry, addr_write);
-    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
+    int fast_off = tlb_mask_table_ofs(s, mem_index);
     int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
     int table_off = fast_off + offsetof(CPUTLBDescFast, table);
 
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     ldst->addrhi_reg = addrhi;
 
     /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -32768);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, mask_off);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_AREG0, table_off);
 
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
     return true;
 }
 
+/* We expect to use a 12-bit negative offset from ENV.  */
+#define MIN_TLB_MASK_TABLE_OFS  -(1 << 11)
+
 /*
  * For softmmu, perform the TLB load and compare.
  * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
     unsigned s_bits = opc & MO_SIZE;
     unsigned s_mask = (1u << s_bits) - 1;
     int mem_index = get_mmuidx(oi);
-    int fast_ofs = TLB_MASK_TABLE_OFS(mem_index);
+    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
     int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask);
     int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table);
     int compare_mask;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase,
     ldst->oi = oi;
     ldst->addrlo_reg = addr_reg;
 
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 11));
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs);
 
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
     return true;
 }
 
+/* We're expecting to use a 20-bit negative offset on the tlb memory ops.  */
+#define MIN_TLB_MASK_TABLE_OFS  -(1 << 19)
+
 /*
  * For softmmu, perform the TLB load and compare.
  * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
 #ifdef CONFIG_SOFTMMU
     unsigned s_mask = (1 << s_bits) - 1;
     int mem_index = get_mmuidx(oi);
-    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
+    int fast_off = tlb_mask_table_ofs(s, mem_index);
     int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
     int table_off = fast_off + offsetof(CPUTLBDescFast, table);
     int ofs, a_off;
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     tcg_out_sh64(s, RSY_SRLG, TCG_TMP0, addr_reg, TCG_REG_NONE,
                  s->page_bits - CPU_TLB_ENTRY_BITS);
 
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 19));
     tcg_out_insn(s, RXY, NG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, mask_off);
     tcg_out_insn(s, RXY, AG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, table_off);
 
diff --git a/tcg/sparc64/tcg-target.c.inc b/tcg/sparc64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc64/tcg-target.c.inc
+++ b/tcg/sparc64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ bool tcg_target_has_memory_bswap(MemOp memop)
     return true;
 }
 
+/* We expect to use a 13-bit negative offset from ENV.  */
+#define MIN_TLB_MASK_TABLE_OFS  -(1 << 12)
+
 /*
  * For softmmu, perform the TLB load and compare.
  * For useronly, perform any required alignment tests.
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
 
 #ifdef CONFIG_SOFTMMU
     int mem_index = get_mmuidx(oi);
-    int fast_off = TLB_MASK_TABLE_OFS(mem_index);
+    int fast_off = tlb_mask_table_ofs(s, mem_index);
     int mask_off = fast_off + offsetof(CPUTLBDescFast, mask);
     int table_off = fast_off + offsetof(CPUTLBDescFast, table);
     int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read)
@@ -XXX,XX +XXX,XX @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
     int cc;
 
     /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx].  */
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
-    QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -(1 << 12));
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T2, TCG_AREG0, mask_off);
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_T3, TCG_AREG0, table_off);
 
-- 
2.34.1

The header tcg/debug-assert.h had been pulled in from tcg/tcg.h,
via exec/cpu_ldst.h, via exec/exec-all.h, but the include of tcg.h
will be removed.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/avr/cpu.c      | 1 +
 target/rx/cpu.c       | 1 +
 target/rx/op_helper.c | 1 +
 target/tricore/cpu.c  | 1 +
 4 files changed, 4 insertions(+)

diff --git a/target/avr/cpu.c b/target/avr/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/cpu.c
+++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "cpu.h"
 #include "disas/dis-asm.h"
+#include "tcg/debug-assert.h"
 
 static void avr_cpu_set_pc(CPUState *cs, vaddr value)
 {
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.c
+++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "hw/loader.h"
 #include "fpu/softfloat.h"
+#include "tcg/debug-assert.h"
 
 static void rx_cpu_set_pc(CPUState *cs, vaddr value)
 {
diff --git a/target/rx/op_helper.c b/target/rx/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/op_helper.c
+++ b/target/rx/op_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/helper-proto.h"
 #include "exec/cpu_ldst.h"
 #include "fpu/softfloat.h"
+#include "tcg/debug-assert.h"
 
 static inline G_NORETURN
 void raise_exception(CPURXState *env, int index,
diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/cpu.c
+++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "qemu/error-report.h"
+#include "tcg/debug-assert.h"
 
 static inline void set_feature(CPUTriCoreState *env, int feature)
 {
-- 
2.34.1

The header tcg/tcg.h had been pulled in from exec/cpu_ldst.h, via
exec/exec-all.h, but that include of tcg.h will be removed.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/monitor.c             | 1 +
 accel/tcg/tcg-accel-ops-mttcg.c | 2 +-
 accel/tcg/tcg-accel-ops-rr.c    | 2 +-
 target/i386/helper.c            | 3 +++
 target/openrisc/sys_helper.c    | 1 +
 5 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/accel/tcg/monitor.c b/accel/tcg/monitor.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/monitor.c
+++ b/accel/tcg/monitor.c
@@ -XXX,XX +XXX,XX @@
 #include "sysemu/cpus.h"
 #include "sysemu/cpu-timers.h"
 #include "sysemu/tcg.h"
+#include "tcg/tcg.h"
 #include "internal.h"
 
 
diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-accel-ops-mttcg.c
+++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/guest-random.h"
 #include "exec/exec-all.h"
 #include "hw/boards.h"
-
+#include "tcg/tcg.h"
 #include "tcg-accel-ops.h"
 #include "tcg-accel-ops-mttcg.h"
 
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/notify.h"
 #include "qemu/guest-random.h"
 #include "exec/exec-all.h"
-
+#include "tcg/tcg.h"
 #include "tcg-accel-ops.h"
 #include "tcg-accel-ops-rr.h"
 #include "tcg-accel-ops-icount.h"
diff --git a/target/i386/helper.c b/target/i386/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -XXX,XX +XXX,XX @@
 #include "monitor/monitor.h"
 #endif
 #include "qemu/log.h"
+#ifdef CONFIG_TCG
+#include "tcg/tcg.h"
+#endif
 
 void cpu_sync_avx_hflag(CPUX86State *env)
 {
diff --git a/target/openrisc/sys_helper.c b/target/openrisc/sys_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/sys_helper.c
+++ b/target/openrisc/sys_helper.c
@@ -XXX,XX +XXX,XX @@
 #ifndef CONFIG_USER_ONLY
 #include "hw/boards.h"
 #endif
+#include "tcg/tcg.h"
 
 #define TO_SPR(group, number) (((group) << 11) + (number))
 
-- 
2.34.1

Often, the only thing we need to know about the TCG host
is the register size.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h                     | 12 +-----------
 tcg/aarch64/tcg-target-reg-bits.h     | 12 ++++++++++++
 tcg/arm/tcg-target-reg-bits.h         | 12 ++++++++++++
 tcg/i386/tcg-target-reg-bits.h        | 16 ++++++++++++++++
 tcg/i386/tcg-target.h                 |  2 --
 tcg/loongarch64/tcg-target-reg-bits.h | 21 +++++++++++++++++++++
 tcg/loongarch64/tcg-target.h          | 11 -----------
 tcg/mips/tcg-target-reg-bits.h        | 18 ++++++++++++++++++
 tcg/mips/tcg-target.h                 |  8 --------
 tcg/ppc/tcg-target-reg-bits.h         | 16 ++++++++++++++++
 tcg/ppc/tcg-target.h                  |  5 -----
 tcg/riscv/tcg-target-reg-bits.h       | 19 +++++++++++++++++++
 tcg/riscv/tcg-target.h                |  9 ---------
 tcg/s390x/tcg-target-reg-bits.h       | 17 +++++++++++++++++
 tcg/sparc64/tcg-target-reg-bits.h     | 12 ++++++++++++
 tcg/tci/tcg-target-reg-bits.h         | 18 ++++++++++++++++++
 tcg/tci/tcg-target.h                  |  8 --------
 tcg/s390x/tcg-target.c.inc            |  5 -----
 18 files changed, 162 insertions(+), 59 deletions(-)
 create mode 100644 tcg/aarch64/tcg-target-reg-bits.h
 create mode 100644 tcg/arm/tcg-target-reg-bits.h
 create mode 100644 tcg/i386/tcg-target-reg-bits.h
 create mode 100644 tcg/loongarch64/tcg-target-reg-bits.h
 create mode 100644 tcg/mips/tcg-target-reg-bits.h
 create mode 100644 tcg/ppc/tcg-target-reg-bits.h
 create mode 100644 tcg/riscv/tcg-target-reg-bits.h
 create mode 100644 tcg/s390x/tcg-target-reg-bits.h
 create mode 100644 tcg/sparc64/tcg-target-reg-bits.h
 create mode 100644 tcg/tci/tcg-target-reg-bits.h

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/plugin.h"
 #include "qemu/queue.h"
 #include "tcg/tcg-mo.h"
+#include "tcg-target-reg-bits.h"
 #include "tcg-target.h"
 #include "tcg/tcg-cond.h"
 #include "tcg/debug-assert.h"
@@ -XXX,XX +XXX,XX @@
 #define CPU_TEMP_BUF_NLONGS 128
 #define TCG_STATIC_FRAME_SIZE  (CPU_TEMP_BUF_NLONGS * sizeof(long))
 
-/* Default target word size to pointer size.  */
-#ifndef TCG_TARGET_REG_BITS
-# if UINTPTR_MAX == UINT32_MAX
-#  define TCG_TARGET_REG_BITS 32
-# elif UINTPTR_MAX == UINT64_MAX
-#  define TCG_TARGET_REG_BITS 64
-# else
-#  error Unknown pointer size for tcg target
-# endif
-#endif
-
 #if TCG_TARGET_REG_BITS == 32
 typedef int32_t tcg_target_long;
 typedef uint32_t tcg_target_ulong;
diff --git a/tcg/aarch64/tcg-target-reg-bits.h b/tcg/aarch64/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/aarch64/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2023 Linaro
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+#define TCG_TARGET_REG_BITS  64
+
+#endif
diff --git a/tcg/arm/tcg-target-reg-bits.h b/tcg/arm/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/arm/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2023 Linaro
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+#define TCG_TARGET_REG_BITS  32
+
+#endif
diff --git a/tcg/i386/tcg-target-reg-bits.h b/tcg/i386/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/i386/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2008 Fabrice Bellard
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+#ifdef __x86_64__
+# define TCG_TARGET_REG_BITS  64
+#else
+# define TCG_TARGET_REG_BITS  32
+#endif
+
+#endif
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 #define TCG_TARGET_INSN_UNIT_SIZE  1
 
 #ifdef __x86_64__
-# define TCG_TARGET_REG_BITS  64
 # define TCG_TARGET_NB_REGS   32
 # define MAX_CODE_GEN_BUFFER_SIZE  (2 * GiB)
 #else
-# define TCG_TARGET_REG_BITS  32
 # define TCG_TARGET_NB_REGS   24
 # define MAX_CODE_GEN_BUFFER_SIZE  UINT32_MAX
 #endif
diff --git a/tcg/loongarch64/tcg-target-reg-bits.h b/tcg/loongarch64/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/loongarch64/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2021 WANG Xuerui <git@xen0n.name>
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+/*
+ * Loongson removed the (incomplete) 32-bit support from kernel and toolchain
+ * for the initial upstreaming of this architecture, so don't bother and just
+ * support the LP64* ABI for now.
+ */
+#if defined(__loongarch64)
+# define TCG_TARGET_REG_BITS 64
+#else
+# error unsupported LoongArch register size
+#endif
+
+#endif
diff --git a/tcg/loongarch64/tcg-target.h b/tcg/loongarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/loongarch64/tcg-target.h
+++ b/tcg/loongarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 #ifndef LOONGARCH_TCG_TARGET_H
 #define LOONGARCH_TCG_TARGET_H
 
-/*
- * Loongson removed the (incomplete) 32-bit support from kernel and toolchain
- * for the initial upstreaming of this architecture, so don't bother and just
- * support the LP64* ABI for now.
- */
-#if defined(__loongarch64)
-# define TCG_TARGET_REG_BITS 64
-#else
-# error unsupported LoongArch register size
-#endif
-
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_NB_REGS 32
 
diff --git a/tcg/mips/tcg-target-reg-bits.h b/tcg/mips/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/mips/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2008-2009 Arnaud Patard <arnaud.patard@rtp-net.org>
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+#if _MIPS_SIM == _ABIO32
+# define TCG_TARGET_REG_BITS 32
+#elif _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64
+# define TCG_TARGET_REG_BITS 64
+#else
+# error "Unknown ABI"
+#endif
+
+#endif
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 #ifndef MIPS_TCG_TARGET_H
 #define MIPS_TCG_TARGET_H
 
-#if _MIPS_SIM == _ABIO32
-# define TCG_TARGET_REG_BITS 32
-#elif _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64
-# define TCG_TARGET_REG_BITS 64
-#else
-# error "Unknown ABI"
-#endif
-
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_NB_REGS 32
 
diff --git a/tcg/ppc/tcg-target-reg-bits.h b/tcg/ppc/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/ppc/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2008 Fabrice Bellard
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+#ifdef _ARCH_PPC64
+# define TCG_TARGET_REG_BITS  64
+#else
+# define TCG_TARGET_REG_BITS  32
+#endif
+
+#endif
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 #ifndef PPC_TCG_TARGET_H
 #define PPC_TCG_TARGET_H
 
-#ifdef _ARCH_PPC64
-# define TCG_TARGET_REG_BITS  64
-#else
-# define TCG_TARGET_REG_BITS  32
-#endif
 #define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 
 #define TCG_TARGET_NB_REGS 64
diff --git a/tcg/riscv/tcg-target-reg-bits.h b/tcg/riscv/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/riscv/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2018 SiFive, Inc
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+/*
+ * We don't support oversize guests.
+ * Since we will only build tcg once, this in turn requires a 64-bit host.
+ */
+#if __riscv_xlen != 64
+#error "unsupported code generation mode"
+#endif
+#define TCG_TARGET_REG_BITS 64
+
+#endif
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.h
+++ b/tcg/riscv/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 #ifndef RISCV_TCG_TARGET_H
 #define RISCV_TCG_TARGET_H
 
-/*
- * We don't support oversize guests.
- * Since we will only build tcg once, this in turn requires a 64-bit host.
- */
-#if __riscv_xlen != 64
-#error "unsupported code generation mode"
-#endif
-#define TCG_TARGET_REG_BITS 64
-
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_NB_REGS 32
 #define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
diff --git a/tcg/s390x/tcg-target-reg-bits.h b/tcg/s390x/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/s390x/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2009 Ulrich Hecht <uli@suse.de>
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+/* We only support generating code for 64-bit mode.  */
+#if UINTPTR_MAX == UINT64_MAX
+# define TCG_TARGET_REG_BITS 64
+#else
+# error "unsupported code generation mode"
+#endif
+
+#endif
diff --git a/tcg/sparc64/tcg-target-reg-bits.h b/tcg/sparc64/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/sparc64/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2023 Linaro
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+#define TCG_TARGET_REG_BITS  64
+
+#endif
diff --git a/tcg/tci/tcg-target-reg-bits.h b/tcg/tci/tcg-target-reg-bits.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/tci/tcg-target-reg-bits.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define target-specific register size
+ * Copyright (c) 2009, 2011 Stefan Weil
+ */
+
+#ifndef TCG_TARGET_REG_BITS_H
+#define TCG_TARGET_REG_BITS_H
+
+#if UINTPTR_MAX == UINT32_MAX
+# define TCG_TARGET_REG_BITS 32
+#elif UINTPTR_MAX == UINT64_MAX
+# define TCG_TARGET_REG_BITS 64
+#else
+# error Unknown pointer size for tci target
+#endif
+
+#endif
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -XXX,XX +XXX,XX @@
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define MAX_CODE_GEN_BUFFER_SIZE  ((size_t)-1)
 
-#if UINTPTR_MAX == UINT32_MAX
-# define TCG_TARGET_REG_BITS 32
-#elif UINTPTR_MAX == UINT64_MAX
-# define TCG_TARGET_REG_BITS 64
-#else
-# error Unknown pointer size for tci target
-#endif
-
 /* Optional instructions. */
 
 #define TCG_TARGET_HAS_bswap16_i32      1
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
-/* We only support generating code for 64-bit mode.  */
-#if TCG_TARGET_REG_BITS != 64
-#error "unsupported code generation mode"
-#endif
-
 #include "../tcg-ldst.c.inc"
 #include "../tcg-pool.c.inc"
 #include "elf.h"
-- 
2.34.1

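TCG_OVERSIZED_GUEST is always defined, even if to 0, so the existing
#ifndef test could never trigger.  We wanted to test the value of the
symbol instead, i.e. raise the #error when TCG_OVERSIZED_GUEST == 0.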

With this fixed, the #error is reached while building arm-softmmu,
because TCG_OVERSIZED_GUEST is not true (nor supposed to be true) for
an arm32 guest on a 32-bit host.  But that's ok, because this feature
doesn't apply to arm32.  Add an #ifdef for TARGET_AARCH64.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/ptw.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -XXX,XX +XXX,XX @@ static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
                              uint64_t new_val, S1Translate *ptw,
                              ARMMMUFaultInfo *fi)
 {
+#ifdef TARGET_AARCH64
     uint64_t cur_val;
     void *host = ptw->out_host;
 
@@ -XXX,XX +XXX,XX @@ static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
      * we know that TCG_OVERSIZED_GUEST is set, which means that we are
      * running in round-robin mode and could only race with dma i/o.
      */
-#ifndef TCG_OVERSIZED_GUEST
+#if !TCG_OVERSIZED_GUEST
 # error "Unexpected configuration"
 #endif
     bool locked = qemu_mutex_iothread_locked();
@@ -XXX,XX +XXX,XX @@ static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t old_val,
 #endif
 
     return cur_val;
+#else
+    /* AArch32 does not have FEAT_HAFDBS. */
+    g_assert_not_reached();
+#endif
 }
 
 static bool get_level1_table_address(CPUARMState *env, ARMMMUIdx mmu_idx,
-- 
2.34.1

Move a use of TARGET_LONG_BITS out of tcg/tcg.h.
Include the new file only where required.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu_ldst.h       |  3 +--
 include/tcg/oversized-guest.h | 23 +++++++++++++++++++++++
 include/tcg/tcg.h             |  9 ---------
 accel/tcg/cputlb.c            |  1 +
 accel/tcg/tcg-all.c           |  1 +
 target/arm/ptw.c              |  1 +
 target/riscv/cpu_helper.c     |  1 +
 7 files changed, 28 insertions(+), 11 deletions(-)
 create mode 100644 include/tcg/oversized-guest.h

diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@@ -XXX,XX +XXX,XX @@ static inline void clear_helper_retaddr(void)
 
 #else
 
-/* Needed for TCG_OVERSIZED_GUEST */
-#include "tcg/tcg.h"
+#include "tcg/oversized-guest.h"
 
 static inline target_ulong tlb_read_idx(const CPUTLBEntry *entry,
                                         MMUAccessType access_type)
diff --git a/include/tcg/oversized-guest.h b/include/tcg/oversized-guest.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/tcg/oversized-guest.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define TCG_OVERSIZED_GUEST
+ * Copyright (c) 2008 Fabrice Bellard
+ */
+
+#ifndef EXEC_TCG_OVERSIZED_GUEST_H
+#define EXEC_TCG_OVERSIZED_GUEST_H
+
+#include "tcg-target-reg-bits.h"
+#include "cpu-param.h"
+
+/*
+ * Oversized TCG guests make things like MTTCG hard
+ * as we can't use atomics for cputlb updates.
+ */
+#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+#define TCG_OVERSIZED_GUEST 1
+#else
+#define TCG_OVERSIZED_GUEST 0
+#endif
+
+#endif
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef uint64_t tcg_target_ulong;
 #error unsupported
 #endif
 
-/* Oversized TCG guests make things like MTTCG hard
- * as we can't use atomics for cputlb updates.
- */
-#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
-#define TCG_OVERSIZED_GUEST 1
-#else
-#define TCG_OVERSIZED_GUEST 0
-#endif
-
 #if TCG_TARGET_NB_REGS <= 32
 typedef uint32_t TCGRegSet;
 #elif TCG_TARGET_NB_REGS <= 64
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/plugin-memory.h"
 #endif
 #include "tcg/tcg-ldst.h"
+#include "tcg/oversized-guest.h"
 #include "exec/helper-proto.h"
 
 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
diff --git a/accel/tcg/tcg-all.c b/accel/tcg/tcg-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-all.c
+++ b/accel/tcg/tcg-all.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/replay-core.h"
 #include "sysemu/cpu-timers.h"
 #include "tcg/tcg.h"
+#include "tcg/oversized-guest.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/accel.h"
diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "internals.h"
 #include "idau.h"
+#include "tcg/oversized-guest.h"
 
 
 typedef struct S1Translate {
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -XXX,XX +XXX,XX @@
 #include "sysemu/cpu-timers.h"
 #include "cpu_bits.h"
 #include "debug.h"
+#include "tcg/oversized-guest.h"
 
 int riscv_cpu_mmu_index(CPURISCVState *env, bool ifetch)
 {
-- 
2.34.1

These two items (TCGv and dup_const_tl) are the last uses of
TARGET_LONG_BITS within tcg.h, and have more in common with the
other "_tl" definitions within tcg-op.h.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op.h        | 15 ++++++++++++++-
 include/tcg/tcg.h           | 19 -------------------
 target/mips/tcg/translate.h |  1 +
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_plugin_cb_end(void)
 }
 
 #if TARGET_LONG_BITS == 32
+typedef TCGv_i32 TCGv;
 #define tcg_temp_new() tcg_temp_new_i32()
 #define tcg_global_mem_new tcg_global_mem_new_i32
 #define tcg_temp_free tcg_temp_free_i32
 #define tcgv_tl_temp tcgv_i32_temp
 #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32
 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32
-#else
+#elif TARGET_LONG_BITS == 64
+typedef TCGv_i64 TCGv;
 #define tcg_temp_new() tcg_temp_new_i64()
 #define tcg_global_mem_new tcg_global_mem_new_i64
 #define tcg_temp_free tcg_temp_free_i64
 #define tcgv_tl_temp tcgv_i64_temp
 #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64
 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64
+#else
+#error Unhandled TARGET_LONG_BITS value
 #endif
 
 void tcg_gen_qemu_ld_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
 #define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i64
 #define tcg_gen_dup_tl_vec  tcg_gen_dup_i64_vec
 #define tcg_gen_dup_tl tcg_gen_dup_i64
+#define dup_const_tl dup_const
 #else
 #define tcg_gen_movi_tl tcg_gen_movi_i32
 #define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -XXX,XX +XXX,XX @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
 #define tcg_gen_atomic_umax_fetch_tl tcg_gen_atomic_umax_fetch_i32
 #define tcg_gen_dup_tl_vec  tcg_gen_dup_i32_vec
 #define tcg_gen_dup_tl tcg_gen_dup_i32
+
+#define dup_const_tl(VECE, C)                                      \
+    (__builtin_constant_p(VECE)                                    \
+     ? (  (VECE) == MO_8  ? 0x01010101ul * (uint8_t)(C)            \
+        : (VECE) == MO_16 ? 0x00010001ul * (uint16_t)(C)           \
+        : (VECE) == MO_32 ? 0x00000001ul * (uint32_t)(C)           \
+        : (qemu_build_not_reached_always(), 0))                    \
+     :  (target_long)dup_const(VECE, C))
 #endif
 
 #if UINTPTR_MAX == UINT32_MAX
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef struct TCGv_i128_d *TCGv_i128;
 typedef struct TCGv_ptr_d *TCGv_ptr;
 typedef struct TCGv_vec_d *TCGv_vec;
 typedef TCGv_ptr TCGv_env;
-#if TARGET_LONG_BITS == 32
-#define TCGv TCGv_i32
-#elif TARGET_LONG_BITS == 64
-#define TCGv TCGv_i64
-#else
-#error Unhandled TARGET_LONG_BITS value
-#endif
 
 /* call flags */
 /* Helper does not read globals (either directly or through an exception). It
@@ -XXX,XX +XXX,XX @@ uint64_t dup_const(unsigned vece, uint64_t c);
         : (qemu_build_not_reached_always(), 0))                    \
      : dup_const(VECE, C))
 
-#if TARGET_LONG_BITS == 64
-# define dup_const_tl  dup_const
-#else
-# define dup_const_tl(VECE, C)                                     \
-    (__builtin_constant_p(VECE)                                    \
-     ? (  (VECE) == MO_8  ? 0x01010101ul * (uint8_t)(C)            \
-        : (VECE) == MO_16 ? 0x00010001ul * (uint16_t)(C)           \
-        : (VECE) == MO_32 ? 0x00000001ul * (uint32_t)(C)           \
-        : (qemu_build_not_reached_always(), 0))                    \
-     :  (target_long)dup_const(VECE, C))
-#endif
-
 #ifdef CONFIG_DEBUG_TCG
 void tcg_assert_listed_vecop(TCGOpcode);
 #else
diff --git a/target/mips/tcg/translate.h b/target/mips/tcg/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.h
+++ b/target/mips/tcg/translate.h
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/log.h"
 #include "exec/translator.h"
+#include "tcg/tcg-op.h"
 
 #define MIPS_DEBUG_DISAS 0
 
-- 
2.34.1

Create tcg/tcg-op-common.h, moving everything that does not concern
TARGET_LONG_BITS or TCGv.  Adjust tcg/*.c to use the new header
instead of tcg-op.h, in preparation for compiling tcg/ only once.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-common.h |  996 ++++++++++++++++++++++++++++++++++
 include/tcg/tcg-op.h        | 1004 +----------------------------------
 tcg/optimize.c              |    2 +-
 tcg/tcg-op-gvec.c           |    2 +-
 tcg/tcg-op-ldst.c           |    2 +-
 tcg/tcg-op-vec.c            |    2 +-
 tcg/tcg-op.c                |    2 +-
 tcg/tcg.c                   |    2 +-
 tcg/tci.c                   |    3 +-
 9 files changed, 1007 insertions(+), 1008 deletions(-)
 create mode 100644 include/tcg/tcg-op-common.h

diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/tcg/tcg-op-common.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Target independent opcode generation functions.
+ *
+ * Copyright (c) 2008 Fabrice Bellard
+ */
+
+#ifndef TCG_TCG_OP_COMMON_H
+#define TCG_TCG_OP_COMMON_H
+
+#include "tcg/tcg.h"
+#include "exec/helper-proto.h"
+#include "exec/helper-gen.h"
+
+/*
+ * Basic output routines.  Not for general consumption.
+ *
+ * Each tcg_gen_opN() takes a TCGOpcode followed by N TCGArg operands.
+ * The static inline wrappers below convert TCGv_i32/TCGv_i64 handles
+ * with tcgv_i32_arg()/tcgv_i64_arg() and forward to these; each "i" in
+ * a wrapper's name marks one trailing operand passed as a raw TCGArg.
+ * vec_gen_N() take a TCGOpcode, a TCGType and an unsigned value ahead
+ * of their N TCGArg operands.
+ */
+
+void tcg_gen_op1(TCGOpcode, TCGArg);
+void tcg_gen_op2(TCGOpcode, TCGArg, TCGArg);
+void tcg_gen_op3(TCGOpcode, TCGArg, TCGArg, TCGArg);
+void tcg_gen_op4(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg);
+void tcg_gen_op5(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
+void tcg_gen_op6(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
+
+void vec_gen_2(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg);
+void vec_gen_3(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg);
+void vec_gen_4(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg, TCGArg);
+
+static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1)
+{
+    tcg_gen_op1(opc, tcgv_i32_arg(a1));
+}
+
+static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 a1)
+{
+    tcg_gen_op1(opc, tcgv_i64_arg(a1));
+}
+
+static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg a1)
+{
+    tcg_gen_op1(opc, a1);
+}
+
+static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2)
+{
+    tcg_gen_op2(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2));
+}
+
+static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2)
+{
+    tcg_gen_op2(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2));
+}
+
+static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 a1, TCGArg a2)
+{
+    tcg_gen_op2(opc, tcgv_i32_arg(a1), a2);
+}
+
+static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 a1, TCGArg a2)
+{
+    tcg_gen_op2(opc, tcgv_i64_arg(a1), a2);
+}
+
+static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg a1, TCGArg a2)
+{
+    tcg_gen_op2(opc, a1, a2);
+}
+
+static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 a1,
+                                   TCGv_i32 a2, TCGv_i32 a3)
+{
+    tcg_gen_op3(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), tcgv_i32_arg(a3));
+}
+
+static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 a1,
+                                   TCGv_i64 a2, TCGv_i64 a3)
+{
+    tcg_gen_op3(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), tcgv_i64_arg(a3));
+}
+
+static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 a1,
+                                    TCGv_i32 a2, TCGArg a3)
+{
+    tcg_gen_op3(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), a3);
+}
+
+static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 a1,
+                                    TCGv_i64 a2, TCGArg a3)
+{
+    tcg_gen_op3(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), a3);
+}
+
+static inline void tcg_gen_ldst_op_i32(TCGOpcode opc, TCGv_i32 val,
+                                       TCGv_ptr base, TCGArg offset)
+{
+    tcg_gen_op3(opc, tcgv_i32_arg(val), tcgv_ptr_arg(base), offset);
+}
+
+static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, TCGv_i64 val,
+                                       TCGv_ptr base, TCGArg offset)
+{
+    tcg_gen_op3(opc, tcgv_i64_arg(val), tcgv_ptr_arg(base), offset);
+}
+
+static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4)
+{
+    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
+                tcgv_i32_arg(a3), tcgv_i32_arg(a4));
+}
+
+static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4)
+{
+    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
+                tcgv_i64_arg(a3), tcgv_i64_arg(a4));
+}
+
+static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGArg a4)
+{
+    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
+                tcgv_i32_arg(a3), a4);
+}
+
+static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGArg a4)
+{
+    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
+                tcgv_i64_arg(a3), a4);
+}
+
+static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGArg a3, TCGArg a4)
+{
+    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), a3, a4);
+}
+
+static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGArg a3, TCGArg a4)
+{
+    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), a3, a4);
+}
+
+static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4, TCGv_i32 a5)
+{
+    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5));
+}
+
+static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4, TCGv_i64 a5)
+{
+    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5));
+}
+
+static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGv_i32 a4, TCGArg a5)
+{
+    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), a5);
+}
+
+static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGv_i64 a4, TCGArg a5)
+{
+    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), a5);
+}
+
+static inline void tcg_gen_op5ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGv_i32 a3, TCGArg a4, TCGArg a5)
+{
+    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
+                tcgv_i32_arg(a3), a4, a5);
+}
+
+static inline void tcg_gen_op5ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGv_i64 a3, TCGArg a4, TCGArg a5)
+{
+    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
+                tcgv_i64_arg(a3), a4, a5);
+}
+
+static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                   TCGv_i32 a3, TCGv_i32 a4,
+                                   TCGv_i32 a5, TCGv_i32 a6)
+{
+    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5),
+                tcgv_i32_arg(a6));
+}
+
+static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                   TCGv_i64 a3, TCGv_i64 a4,
+                                   TCGv_i64 a5, TCGv_i64 a6)
+{
+    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5),
+                tcgv_i64_arg(a6));
+}
+
+static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                    TCGv_i32 a3, TCGv_i32 a4,
+                                    TCGv_i32 a5, TCGArg a6)
+{
+    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5), a6);
+}
+
+static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                    TCGv_i64 a3, TCGv_i64 a4,
+                                    TCGv_i64 a5, TCGArg a6)
+{
+    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5), a6);
+}
+
+static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
+                                     TCGv_i32 a3, TCGv_i32 a4,
+                                     TCGArg a5, TCGArg a6)
+{
+    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
+                tcgv_i32_arg(a3), tcgv_i32_arg(a4), a5, a6);
+}
+
+static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
+                                     TCGv_i64 a3, TCGv_i64 a4,
+                                     TCGArg a5, TCGArg a6)
+{
+    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
+                tcgv_i64_arg(a3), tcgv_i64_arg(a4), a5, a6);
+}
+
+
+/* Generic ops.  */
+
+static inline void gen_set_label(TCGLabel *l)
+{
+    l->present = 1;     /* mark @l present, then emit its set_label op */
+    tcg_gen_op1(INDEX_op_set_label, label_arg(l));
+}
+
+void tcg_gen_br(TCGLabel *l);
+void tcg_gen_mb(TCGBar);
+
+/**
+ * tcg_gen_exit_tb() - output exit_tb TCG operation
+ * @tb: The TranslationBlock from which we are exiting
+ * @idx: Direct jump slot index, or exit request
+ *
+ * See tcg/README for more info about this TCG operation.
+ * See also tcg.h and the block comment above TB_EXIT_MASK.
+ *
+ * For a normal exit from the TB, back to the main loop, @tb should
+ * be NULL and @idx should be 0.  Otherwise, @tb should be valid and
+ * @idx should be one of the TB_EXIT_ values.
+ */
+void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx);
+
+/**
+ * tcg_gen_goto_tb() - output goto_tb TCG operation
+ * @idx: Direct jump slot index (0 or 1)
+ *
+ * See tcg/README for more info about this TCG operation.
+ *
+ * NOTE: In softmmu emulation, direct jumps with goto_tb are only safe within
+ * the pages this TB resides in because we don't take care of direct jumps when
+ * address mapping changes, e.g. in tlb_flush(). In user mode, there's only a
+ * static address translation, so the destination address is always valid, TBs
+ * are always invalidated properly, and direct jumps are reset when mapping
+ * changes.
+ */
+void tcg_gen_goto_tb(unsigned idx);
+
+/**
+ * tcg_gen_lookup_and_goto_ptr() - look up the current TB, jump to it if valid
+ *
+ * If the TB is not valid, jump to the epilogue.
+ *
+ * This operation is optional. If the TCG backend does not implement goto_ptr,
+ * this op is equivalent to calling tcg_gen_exit_tb() with 0 as the argument.
+ */
+void tcg_gen_lookup_and_goto_ptr(void);
+
+static inline void tcg_gen_plugin_cb_start(unsigned from, unsigned type,
+                                           unsigned wr)
+{
+    tcg_gen_op3(INDEX_op_plugin_cb_start, from, type, wr);
+}
+
+static inline void tcg_gen_plugin_cb_end(void)
+{
+    tcg_emit_op(INDEX_op_plugin_cb_end, 0);
+}
+
+/* 32 bit ops */
+
+void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg);
+void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
+void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_clz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_ctz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_clzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
+void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
+void tcg_gen_clrsb_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ctpop_i32(TCGv_i32 a1, TCGv_i32 a2);
+void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
+                         unsigned int ofs, unsigned int len);
+void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
+                           unsigned int ofs, unsigned int len);
+void tcg_gen_extract_i32(TCGv_i32 ret, TCGv_i32 arg,
+                         unsigned int ofs, unsigned int len);
+void tcg_gen_sextract_i32(TCGv_i32 ret, TCGv_i32 arg,
+                          unsigned int ofs, unsigned int len);
+void tcg_gen_extract2_i32(TCGv_i32 ret, TCGv_i32 al, TCGv_i32 ah,
+                          unsigned int ofs);
+void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, TCGv_i32 arg2, TCGLabel *);
+void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, TCGLabel *);
+void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
+                         TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
+                          TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1,
+                         TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2);
+void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
+void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
+                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
+void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_mulsu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg, int flags);
+void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_smin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_smax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_umax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_abs_i32(TCGv_i32, TCGv_i32);
+
+/* Replicate a value of size @vece from @in to all the lanes in @out */
+void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in);
+
+static inline void tcg_gen_discard_i32(TCGv_i32 arg)
+{
+    tcg_gen_op1_i32(INDEX_op_discard, arg);
+}
+
+static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (ret != arg) {   /* a move onto the same temp emits no op */
+        tcg_gen_op2_i32(INDEX_op_mov_i32, ret, arg);
+    }
+}
+
+static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                    tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i32(INDEX_op_ld8u_i32, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                    tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i32(INDEX_op_ld8s_i32, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i32(INDEX_op_ld16u_i32, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i32(INDEX_op_ld16s_i32, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2,
+                                  tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i32(INDEX_op_ld_i32, ret, arg2, offset);
+}
+
+static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                   tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i32(INDEX_op_st8_i32, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i32(INDEX_op_st16_i32, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2,
+                                  tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i32(INDEX_op_st_i32, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_add_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_add_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_sub_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_sub_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_and_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_or_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_xor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_shl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_shl_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_shr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_shr_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_sar_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_sar_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_mul_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
+{
+    tcg_gen_op3_i32(INDEX_op_mul_i32, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_neg_i32) {   /* emit the dedicated neg_i32 op */
+        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg);
+    } else {                        /* else go through subfi with 0 */
+        tcg_gen_subfi_i32(ret, 0, arg);
+    }
+}
+
+static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    if (TCG_TARGET_HAS_not_i32) {   /* emit the dedicated not_i32 op */
+        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg);
+    } else {                        /* else go through xori with -1 */
+        tcg_gen_xori_i32(ret, arg, -1);
+    }
+}
+
+/* 64 bit ops */
+
+void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
+void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
+void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_clz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_ctz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_clzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
+void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
+void tcg_gen_clrsb_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ctpop_i64(TCGv_i64 a1, TCGv_i64 a2);
+void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
+                         unsigned int ofs, unsigned int len);
+void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
+                           unsigned int ofs, unsigned int len);
+void tcg_gen_extract_i64(TCGv_i64 ret, TCGv_i64 arg,
+                         unsigned int ofs, unsigned int len);
+void tcg_gen_sextract_i64(TCGv_i64 ret, TCGv_i64 arg,
+                          unsigned int ofs, unsigned int len);
+void tcg_gen_extract2_i64(TCGv_i64 ret, TCGv_i64 al, TCGv_i64 ah,
+                          unsigned int ofs);
+void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, TCGLabel *);
+void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, TCGLabel *);
+void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
+                         TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
+                          TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1,
+                         TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2);
+void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
+void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
+                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
+void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_mulsu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
+void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
+void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_hswap_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_wswap_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_smin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_smax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_umax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_abs_i64(TCGv_i64, TCGv_i64);
+
+/* Replicate a value of size @vece from @in to all the lanes in @out */
+void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in);
+
+#if TCG_TARGET_REG_BITS == 64
+static inline void tcg_gen_discard_i64(TCGv_i64 arg)
+{
+    tcg_gen_op1_i64(INDEX_op_discard, arg);
+}
+
+static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (ret != arg) {   /* a move onto the same temp emits no op */
+        tcg_gen_op2_i64(INDEX_op_mov_i64, ret, arg);
+    }
+}
+
+static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                    tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_ld8u_i64, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                    tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_ld8s_i64, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_ld16u_i64, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_ld16s_i64, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_ld32u_i64, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                     tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_ld32s_i64, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2,
+                                  tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_ld_i64, ret, arg2, offset);
+}
+
+static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                   tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_st8_i64, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_st16_i64, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                    tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_st32_i64, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2,
+                                  tcg_target_long offset)
+{
+    tcg_gen_ldst_op_i64(INDEX_op_st_i64, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_add_i64, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_sub_i64, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_shl_i64, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_shr_i64, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_sar_i64, ret, arg1, arg2);
+}
+
+static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
+{
+    tcg_gen_op3_i64(INDEX_op_mul_i64, ret, arg1, arg2);
+}
+#else /* TCG_TARGET_REG_BITS == 32 */
+void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
+
+void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+
+void tcg_gen_discard_i64(TCGv_i64 arg);
+void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
+void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
+#endif /* TCG_TARGET_REG_BITS */
+
+static inline void tcg_gen_neg_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    if (TCG_TARGET_HAS_neg_i64) {   /* emit the dedicated neg_i64 op */
+        tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg);
+    } else {                        /* else go through subfi with 0 */
+        tcg_gen_subfi_i64(ret, 0, arg);
+    }
+}
+
+/* Size changing operations.  */
+
+void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
+void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
+void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high);
+void tcg_gen_extrl_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
+void tcg_gen_extrh_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
+void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg);
+void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg);
+
+void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src);
+void tcg_gen_extr_i128_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i128 arg);
+void tcg_gen_concat_i64_i128(TCGv_i128 ret, TCGv_i64 lo, TCGv_i64 hi);
+
+static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
+{
+    tcg_gen_deposit_i64(ret, lo, hi, 32, 32);   /* ofs = 32, len = 32 */
+}
+
+/* Local load/store bit ops */
+
+void tcg_gen_qemu_ld_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
+void tcg_gen_qemu_st_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
+void tcg_gen_qemu_ld_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType);
+void tcg_gen_qemu_st_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType);
+void tcg_gen_qemu_ld_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType);
+void tcg_gen_qemu_st_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType);
+
+/* Atomic ops */
+
+void tcg_gen_atomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32,
+                                    TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64,
+                                    TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128,
+                                     TCGv_i128, TCGArg, MemOp, TCGType);
+
+void tcg_gen_nonatomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_nonatomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_nonatomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128,
+                                        TCGv_i128, TCGArg, MemOp, TCGType);
+
+void tcg_gen_atomic_xchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                 TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_xchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                 TCGArg, MemOp, TCGType);
+
+/*
+ * Atomic read-modify-write, "fetch_<op>" forms, in 32- and 64-bit variants
+ * for add, and, or, xor, smin, umin, smax and umax.
+ * NOTE(review): by the usual fetch_<op> naming these presumably yield the
+ * memory value from before the operation -- confirm.
+ */
+void tcg_gen_atomic_fetch_add_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_add_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_and_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_and_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_or_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                     TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_or_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                     TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_xor_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_xor_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_smin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_smin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_umin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_umin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_smax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_smax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_umax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_fetch_umax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+
+/*
+ * Atomic read-modify-write, "<op>_fetch" forms, in 32- and 64-bit variants
+ * for add, and, or, xor, smin, umin, smax and umax.
+ * NOTE(review): by the usual <op>_fetch naming these presumably yield the
+ * value produced by the operation rather than the prior memory contents
+ * -- confirm.
+ */
+void tcg_gen_atomic_add_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_add_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_and_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_and_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_or_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                     TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_or_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                     TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_xor_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_xor_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                      TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_smin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_smin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_umin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_umin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_smax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_smax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_umax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
+                                       TCGArg, MemOp, TCGType);
+void tcg_gen_atomic_umax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
+                                       TCGArg, MemOp, TCGType);
+
+/*
+ * Vector ops
+ *
+ * Apart from tcg_gen_mov_vec, every prototype in this group takes an
+ * unsigned @vece first, followed by the vector operands @r, @a and,
+ * for the two-source operations, @b.
+ * NOTE(review): @vece is presumably the log2 element-size selector and @r
+ * the result operand -- confirm against the definitions.
+ */
+
+void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
+void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
+void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
+void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
+void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
+void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_smin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_umin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_smax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+
+/* Shifts and rotates of @a by the immediate count @i. */
+void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+void tcg_gen_rotli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+void tcg_gen_rotri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+
+/*
+ * Shifts and rotates of @a by the count held in the i32 value @s.
+ * Only a rotate-left form is declared here; there is no rotrs counterpart.
+ */
+void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+void tcg_gen_rotls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+
+/*
+ * Shifts and rotates of @a by the vector operand @s.
+ * NOTE(review): presumably each element of @a uses the count in the
+ * corresponding element of @s -- confirm.
+ */
+void tcg_gen_shlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
+void tcg_gen_shrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
+void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
+void tcg_gen_rotlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
+void tcg_gen_rotrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
+
+/* Comparison of @a against @b under condition @cond. */
+void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r,
+                     TCGv_vec a, TCGv_vec b);
+
+/*
+ * Selection operations.
+ * NOTE(review): presumably bitsel computes r = (b & a) | (c & ~a) and
+ * cmpsel computes r = (a cond b) ? c : d per element -- the operand roles
+ * are not visible from these prototypes; confirm against the definitions.
+ */
+void tcg_gen_bitsel_vec(unsigned vece, TCGv_vec r, TCGv_vec a,
+                        TCGv_vec b, TCGv_vec c);
+void tcg_gen_cmpsel_vec(TCGCond cond, unsigned vece, TCGv_vec r,
+                        TCGv_vec a, TCGv_vec b, TCGv_vec c, TCGv_vec d);
+
+/*
+ * Vector load/store of @r relative to the host pointer @base at @offset.
+ * tcg_gen_stl_vec additionally takes a TCGType @t.
+ * NOTE(review): @t presumably selects a narrower store width than the full
+ * vector -- confirm against the definition.
+ */
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
+
+/*
+ * Host pointer ops
+ *
+ * The wrappers below operate on TCGv_ptr by forwarding to the i32 or i64
+ * generator that matches the host pointer width.  Two helper macros, local
+ * to this header and removed again with #undef after the wrappers, make
+ * the choice:
+ *   PTR - name suffix (i32 or i64) pasted onto the generator name with glue()
+ *   NAT - the matching TCG value type, used to cast TCGv_ptr operands
+ * The 32-bit pair is selected when UINTPTR_MAX == UINT32_MAX, the 64-bit
+ * pair otherwise.
+ */
+
+#if UINTPTR_MAX == UINT32_MAX
+# define PTR  i32
+# define NAT  TCGv_i32
+#else
+# define PTR  i64
+# define NAT  TCGv_i64
+#endif
+
+/*
+ * Emit a host-pointer-sized load into @r from base @a at offset @o,
+ * forwarding to tcg_gen_ld_i32 or tcg_gen_ld_i64 as selected by PTR.
+ */
+static inline void tcg_gen_ld_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t o)
+{
+    NAT dst = (NAT)r;
+
+    glue(tcg_gen_ld_, PTR)(dst, a, o);
+}
+
+/*
+ * Emit a host-pointer-sized store of @r to base @a at offset @o,
+ * forwarding to tcg_gen_st_i32 or tcg_gen_st_i64 as selected by PTR.
+ */
+static inline void tcg_gen_st_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t o)
+{
+    NAT src = (NAT)r;
+
+    glue(tcg_gen_st_,PTR)(src, a, o);
+}
+
+/*
+ * Emit a discard of the host pointer value @a through the PTR-sized
+ * tcg_gen_discard variant.
+ */
+static inline void tcg_gen_discard_ptr(TCGv_ptr a)
+{
+    NAT victim = (NAT)a;
+
+    glue(tcg_gen_discard_, PTR)(victim);
+}
+
+/*
+ * Emit an addition of the host pointer values @a and @b into @r through
+ * the PTR-sized tcg_gen_add variant.
+ */
+static inline void tcg_gen_add_ptr(TCGv_ptr r, TCGv_ptr a, TCGv_ptr b)
+{
+    NAT dst = (NAT)r;
+    NAT lhs = (NAT)a;
+    NAT rhs = (NAT)b;
+
+    glue(tcg_gen_add_, PTR)(dst, lhs, rhs);
+}
+
+/*
+ * Emit an addition of the constant @b to the host pointer value @a, with
+ * the result in @r, through the PTR-sized tcg_gen_addi variant.
+ */
+static inline void tcg_gen_addi_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t b)
+{
+    NAT dst = (NAT)r;
+    NAT src = (NAT)a;
+
+    glue(tcg_gen_addi_, PTR)(dst, src, b);
+}
+
+/*
+ * Emit a move of the host pointer value @s into @d through the PTR-sized
+ * tcg_gen_mov variant.
+ */
+static inline void tcg_gen_mov_ptr(TCGv_ptr d, TCGv_ptr s)
+{
+    NAT dst = (NAT)d;
+    NAT src = (NAT)s;
+
+    glue(tcg_gen_mov_, PTR)(dst, src);
+}
+
+/*
+ * Emit a load of the constant @s into the host pointer value @d through
+ * the PTR-sized tcg_gen_movi variant.
+ */
+static inline void tcg_gen_movi_ptr(TCGv_ptr d, intptr_t s)
+{
+    NAT dst = (NAT)d;
+
+    glue(tcg_gen_movi_, PTR)(dst, s);
+}
+
+/*
+ * Emit a conditional branch to @label that compares the host pointer
+ * value @a with the constant @b under @cond, through the PTR-sized
+ * tcg_gen_brcondi variant.
+ */
+static inline void tcg_gen_brcondi_ptr(TCGCond cond, TCGv_ptr a,
+                                       intptr_t b, TCGLabel *label)
+{
+    NAT lhs = (NAT)a;
+
+    glue(tcg_gen_brcondi_, PTR)(cond, lhs, b, label);
+}
+
+/*
+ * Convert the 32-bit value @a to host pointer width in @r: a plain i32
+ * move when UINTPTR_MAX == UINT32_MAX, tcg_gen_ext_i32_i64 otherwise.
+ */
+static inline void tcg_gen_ext_i32_ptr(TCGv_ptr r, TCGv_i32 a)
+{
+    NAT dst = (NAT)r;
+
+#if UINTPTR_MAX != UINT32_MAX
+    tcg_gen_ext_i32_i64(dst, a);
+#else
+    tcg_gen_mov_i32(dst, a);
+#endif
+}
+
+/*
+ * Convert the 64-bit value @a to host pointer width in @r: via
+ * tcg_gen_extrl_i64_i32 when UINTPTR_MAX == UINT32_MAX, a plain i64 move
+ * otherwise.
+ */
+static inline void tcg_gen_trunc_i64_ptr(TCGv_ptr r, TCGv_i64 a)
+{
+    NAT dst = (NAT)r;
+
+#if UINTPTR_MAX != UINT32_MAX
+    tcg_gen_mov_i64(dst, a);
+#else
+    tcg_gen_extrl_i64_i32(dst, a);
+#endif
+}
+
+/*
+ * Convert the host pointer value @a to the 64-bit value @r: via
+ * tcg_gen_extu_i32_i64 when UINTPTR_MAX == UINT32_MAX, a plain i64 move
+ * otherwise.
+ */
+static inline void tcg_gen_extu_ptr_i64(TCGv_i64 r, TCGv_ptr a)
+{
+    NAT src = (NAT)a;
+
+#if UINTPTR_MAX != UINT32_MAX
+    tcg_gen_mov_i64(r, src);
+#else
+    tcg_gen_extu_i32_i64(r, src);
+#endif
+}
+
+/*
+ * Convert the host pointer value @a to the 32-bit value @r: a plain i32
+ * move when UINTPTR_MAX == UINT32_MAX, tcg_gen_extrl_i64_i32 otherwise.
+ */
+static inline void tcg_gen_trunc_ptr_i32(TCGv_i32 r, TCGv_ptr a)
+{
+    NAT src = (NAT)a;
+
+#if UINTPTR_MAX != UINT32_MAX
+    tcg_gen_extrl_i64_i32(r, src);
+#else
+    tcg_gen_mov_i32(r, src);
+#endif
+}
+
+#undef PTR
+#undef NAT
+
+#endif /* TCG_TCG_OP_COMMON_H */
diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
 /*
- * Tiny Code Generator for QEMU
+ * Target dependent opcode generation functions.
  *
  * Copyright (c) 2008 Fabrice Bellard
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
  */
 
 #ifndef TCG_TCG_OP_H
 #define TCG_TCG_OP_H
 
-#include "tcg/tcg.h"
-#include "exec/helper-proto.h"
-#include "exec/helper-gen.h"
-
-/* Basic output routines.  Not for general consumption.  */
-
-void tcg_gen_op1(TCGOpcode, TCGArg);
-void tcg_gen_op2(TCGOpcode, TCGArg, TCGArg);
-void tcg_gen_op3(TCGOpcode, TCGArg, TCGArg, TCGArg);
-void tcg_gen_op4(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg);
-void tcg_gen_op5(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
-void tcg_gen_op6(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
-
-void vec_gen_2(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg);
-void vec_gen_3(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg);
-void vec_gen_4(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg, TCGArg);
-
-static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1)
-{
-    tcg_gen_op1(opc, tcgv_i32_arg(a1));
-}
-
-static inline void tcg_gen_op1_i64(TCGOpcode opc, TCGv_i64 a1)
-{
-    tcg_gen_op1(opc, tcgv_i64_arg(a1));
-}
-
-static inline void tcg_gen_op1i(TCGOpcode opc, TCGArg a1)
-{
-    tcg_gen_op1(opc, a1);
-}
-
-static inline void tcg_gen_op2_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2)
-{
-    tcg_gen_op2(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2));
-}
-
-static inline void tcg_gen_op2_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2)
-{
-    tcg_gen_op2(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2));
-}
-
-static inline void tcg_gen_op2i_i32(TCGOpcode opc, TCGv_i32 a1, TCGArg a2)
-{
-    tcg_gen_op2(opc, tcgv_i32_arg(a1), a2);
-}
-
-static inline void tcg_gen_op2i_i64(TCGOpcode opc, TCGv_i64 a1, TCGArg a2)
-{
-    tcg_gen_op2(opc, tcgv_i64_arg(a1), a2);
-}
-
-static inline void tcg_gen_op2ii(TCGOpcode opc, TCGArg a1, TCGArg a2)
-{
-    tcg_gen_op2(opc, a1, a2);
-}
-
-static inline void tcg_gen_op3_i32(TCGOpcode opc, TCGv_i32 a1,
-                                   TCGv_i32 a2, TCGv_i32 a3)
-{
-    tcg_gen_op3(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), tcgv_i32_arg(a3));
-}
-
-static inline void tcg_gen_op3_i64(TCGOpcode opc, TCGv_i64 a1,
-                                   TCGv_i64 a2, TCGv_i64 a3)
-{
-    tcg_gen_op3(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), tcgv_i64_arg(a3));
-}
-
-static inline void tcg_gen_op3i_i32(TCGOpcode opc, TCGv_i32 a1,
-                                    TCGv_i32 a2, TCGArg a3)
-{
-    tcg_gen_op3(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), a3);
-}
-
-static inline void tcg_gen_op3i_i64(TCGOpcode opc, TCGv_i64 a1,
-                                    TCGv_i64 a2, TCGArg a3)
-{
-    tcg_gen_op3(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), a3);
-}
-
-static inline void tcg_gen_ldst_op_i32(TCGOpcode opc, TCGv_i32 val,
-                                       TCGv_ptr base, TCGArg offset)
-{
-    tcg_gen_op3(opc, tcgv_i32_arg(val), tcgv_ptr_arg(base), offset);
-}
-
-static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, TCGv_i64 val,
-                                       TCGv_ptr base, TCGArg offset)
-{
-    tcg_gen_op3(opc, tcgv_i64_arg(val), tcgv_ptr_arg(base), offset);
-}
-
-static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                   TCGv_i32 a3, TCGv_i32 a4)
-{
-    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-                tcgv_i32_arg(a3), tcgv_i32_arg(a4));
-}
-
-static inline void tcg_gen_op4_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                   TCGv_i64 a3, TCGv_i64 a4)
-{
-    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-                tcgv_i64_arg(a3), tcgv_i64_arg(a4));
-}
-
-static inline void tcg_gen_op4i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                    TCGv_i32 a3, TCGArg a4)
-{
-    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-                tcgv_i32_arg(a3), a4);
-}
-
-static inline void tcg_gen_op4i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                    TCGv_i64 a3, TCGArg a4)
-{
-    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-                tcgv_i64_arg(a3), a4);
-}
-
-static inline void tcg_gen_op4ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                     TCGArg a3, TCGArg a4)
-{
-    tcg_gen_op4(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2), a3, a4);
-}
-
-static inline void tcg_gen_op4ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                     TCGArg a3, TCGArg a4)
-{
-    tcg_gen_op4(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2), a3, a4);
-}
-
-static inline void tcg_gen_op5_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                   TCGv_i32 a3, TCGv_i32 a4, TCGv_i32 a5)
-{
-    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5));
-}
-
-static inline void tcg_gen_op5_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                   TCGv_i64 a3, TCGv_i64 a4, TCGv_i64 a5)
-{
-    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5));
-}
-
-static inline void tcg_gen_op5i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                    TCGv_i32 a3, TCGv_i32 a4, TCGArg a5)
-{
-    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-                tcgv_i32_arg(a3), tcgv_i32_arg(a4), a5);
-}
-
-static inline void tcg_gen_op5i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                    TCGv_i64 a3, TCGv_i64 a4, TCGArg a5)
-{
-    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-                tcgv_i64_arg(a3), tcgv_i64_arg(a4), a5);
-}
-
-static inline void tcg_gen_op5ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                     TCGv_i32 a3, TCGArg a4, TCGArg a5)
-{
-    tcg_gen_op5(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-                tcgv_i32_arg(a3), a4, a5);
-}
-
-static inline void tcg_gen_op5ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                     TCGv_i64 a3, TCGArg a4, TCGArg a5)
-{
-    tcg_gen_op5(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-                tcgv_i64_arg(a3), a4, a5);
-}
-
-static inline void tcg_gen_op6_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                   TCGv_i32 a3, TCGv_i32 a4,
-                                   TCGv_i32 a5, TCGv_i32 a6)
-{
-    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5),
-                tcgv_i32_arg(a6));
-}
-
-static inline void tcg_gen_op6_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                   TCGv_i64 a3, TCGv_i64 a4,
-                                   TCGv_i64 a5, TCGv_i64 a6)
-{
-    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5),
-                tcgv_i64_arg(a6));
-}
-
-static inline void tcg_gen_op6i_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                    TCGv_i32 a3, TCGv_i32 a4,
-                                    TCGv_i32 a5, TCGArg a6)
-{
-    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-                tcgv_i32_arg(a3), tcgv_i32_arg(a4), tcgv_i32_arg(a5), a6);
-}
-
-static inline void tcg_gen_op6i_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                    TCGv_i64 a3, TCGv_i64 a4,
-                                    TCGv_i64 a5, TCGArg a6)
-{
-    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-                tcgv_i64_arg(a3), tcgv_i64_arg(a4), tcgv_i64_arg(a5), a6);
-}
-
-static inline void tcg_gen_op6ii_i32(TCGOpcode opc, TCGv_i32 a1, TCGv_i32 a2,
-                                     TCGv_i32 a3, TCGv_i32 a4,
-                                     TCGArg a5, TCGArg a6)
-{
-    tcg_gen_op6(opc, tcgv_i32_arg(a1), tcgv_i32_arg(a2),
-                tcgv_i32_arg(a3), tcgv_i32_arg(a4), a5, a6);
-}
-
-static inline void tcg_gen_op6ii_i64(TCGOpcode opc, TCGv_i64 a1, TCGv_i64 a2,
-                                     TCGv_i64 a3, TCGv_i64 a4,
-                                     TCGArg a5, TCGArg a6)
-{
-    tcg_gen_op6(opc, tcgv_i64_arg(a1), tcgv_i64_arg(a2),
-                tcgv_i64_arg(a3), tcgv_i64_arg(a4), a5, a6);
-}
-
-
-/* Generic ops.  */
-
-static inline void gen_set_label(TCGLabel *l)
-{
-    l->present = 1;
-    tcg_gen_op1(INDEX_op_set_label, label_arg(l));
-}
-
-void tcg_gen_br(TCGLabel *l);
-void tcg_gen_mb(TCGBar);
-
-/* Helper calls. */
-
-/* 32 bit ops */
-
-void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg);
-void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
-void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_eqv_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_nand_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_nor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_orc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_clz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_ctz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_clzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
-void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
-void tcg_gen_clrsb_i32(TCGv_i32 ret, TCGv_i32 arg);
-void tcg_gen_ctpop_i32(TCGv_i32 a1, TCGv_i32 a2);
-void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
-                         unsigned int ofs, unsigned int len);
-void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
-                           unsigned int ofs, unsigned int len);
-void tcg_gen_extract_i32(TCGv_i32 ret, TCGv_i32 arg,
-                         unsigned int ofs, unsigned int len);
-void tcg_gen_sextract_i32(TCGv_i32 ret, TCGv_i32 arg,
-                          unsigned int ofs, unsigned int len);
-void tcg_gen_extract2_i32(TCGv_i32 ret, TCGv_i32 al, TCGv_i32 ah,
-                          unsigned int ofs);
-void tcg_gen_brcond_i32(TCGCond cond, TCGv_i32 arg1, TCGv_i32 arg2, TCGLabel *);
-void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, TCGLabel *);
-void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
-                         TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
-                          TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1,
-                         TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2);
-void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
-                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
-void tcg_gen_sub2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al,
-                      TCGv_i32 ah, TCGv_i32 bl, TCGv_i32 bh);
-void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_mulsu2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg);
-void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg);
-void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
-void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
-void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg, int flags);
-void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
-void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg);
-void tcg_gen_smin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_smax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_umax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_abs_i32(TCGv_i32, TCGv_i32);
-
-/* Replicate a value of size @vece from @in to all the lanes in @out */
-void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in);
-
-static inline void tcg_gen_discard_i32(TCGv_i32 arg)
-{
-    tcg_gen_op1_i32(INDEX_op_discard, arg);
-}
-
-static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (ret != arg) {
-        tcg_gen_op2_i32(INDEX_op_mov_i32, ret, arg);
-    }
-}
-
-static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i32(INDEX_op_ld8u_i32, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld8s_i32(TCGv_i32 ret, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i32(INDEX_op_ld8s_i32, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld16u_i32(TCGv_i32 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i32(INDEX_op_ld16u_i32, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld16s_i32(TCGv_i32 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i32(INDEX_op_ld16s_i32, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld_i32(TCGv_i32 ret, TCGv_ptr arg2,
-                                  tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i32(INDEX_op_ld_i32, ret, arg2, offset);
-}
-
-static inline void tcg_gen_st8_i32(TCGv_i32 arg1, TCGv_ptr arg2,
-                                   tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i32(INDEX_op_st8_i32, arg1, arg2, offset);
-}
-
-static inline void tcg_gen_st16_i32(TCGv_i32 arg1, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i32(INDEX_op_st16_i32, arg1, arg2, offset);
-}
-
-static inline void tcg_gen_st_i32(TCGv_i32 arg1, TCGv_ptr arg2,
-                                  tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i32(INDEX_op_st_i32, arg1, arg2, offset);
-}
-
-static inline void tcg_gen_add_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_add_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_sub_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_sub_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_and_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_and_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_or_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_or_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_xor_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_xor_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_shl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_shl_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_shr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_shr_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_sar_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_sar_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_mul_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
-{
-    tcg_gen_op3_i32(INDEX_op_mul_i32, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_neg_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_neg_i32) {
-        tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg);
-    } else {
-        tcg_gen_subfi_i32(ret, 0, arg);
-    }
-}
-
-static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
-{
-    if (TCG_TARGET_HAS_not_i32) {
-        tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg);
-    } else {
-        tcg_gen_xori_i32(ret, arg, -1);
-    }
-}
-
-/* 64 bit ops */
-
-void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
-void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
-void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_andc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_eqv_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_nand_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_nor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_orc_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_clz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_ctz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_clzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
-void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
-void tcg_gen_clrsb_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_ctpop_i64(TCGv_i64 a1, TCGv_i64 a2);
-void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
-                         unsigned int ofs, unsigned int len);
-void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
-                           unsigned int ofs, unsigned int len);
-void tcg_gen_extract_i64(TCGv_i64 ret, TCGv_i64 arg,
-                         unsigned int ofs, unsigned int len);
-void tcg_gen_sextract_i64(TCGv_i64 ret, TCGv_i64 arg,
-                          unsigned int ofs, unsigned int len);
-void tcg_gen_extract2_i64(TCGv_i64 ret, TCGv_i64 al, TCGv_i64 ah,
-                          unsigned int ofs);
-void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, TCGLabel *);
-void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, TCGLabel *);
-void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
-                         TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
-                          TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1,
-                         TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2);
-void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
-                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
-void tcg_gen_sub2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al,
-                      TCGv_i64 ah, TCGv_i64 bl, TCGv_i64 bh);
-void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_mulsu2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_ext8s_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_ext16s_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
-void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
-void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_hswap_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_wswap_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_smin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_smax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_umax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_abs_i64(TCGv_i64, TCGv_i64);
-
-/* Replicate a value of size @vece from @in to all the lanes in @out */
-void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in);
-
-#if TCG_TARGET_REG_BITS == 64
-static inline void tcg_gen_discard_i64(TCGv_i64 arg)
-{
-    tcg_gen_op1_i64(INDEX_op_discard, arg);
-}
-
-static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (ret != arg) {
-        tcg_gen_op2_i64(INDEX_op_mov_i64, ret, arg);
-    }
-}
-
-static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_ld8u_i64, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_ld8s_i64, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_ld16u_i64, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_ld16s_i64, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_ld32u_i64, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                     tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_ld32s_i64, ret, arg2, offset);
-}
-
-static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2,
-                                  tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_ld_i64, ret, arg2, offset);
-}
-
-static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                   tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_st8_i64, arg1, arg2, offset);
-}
-
-static inline void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_st16_i64, arg1, arg2, offset);
-}
-
-static inline void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                    tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_st32_i64, arg1, arg2, offset);
-}
-
-static inline void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2,
-                                  tcg_target_long offset)
-{
-    tcg_gen_ldst_op_i64(INDEX_op_st_i64, arg1, arg2, offset);
-}
-
-static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_add_i64, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_sub_i64, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_and_i64, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_or_i64, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_xor_i64, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_shl_i64, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_shr_i64, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_sar_i64, ret, arg1, arg2);
-}
-
-static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
-{
-    tcg_gen_op3_i64(INDEX_op_mul_i64, ret, arg1, arg2);
-}
-#else /* TCG_TARGET_REG_BITS == 32 */
-void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_st16_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_st32_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
-
-void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-
-void tcg_gen_discard_i64(TCGv_i64 arg);
-void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_ld16s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_ld32u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_ld32s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_st_i64(TCGv_i64 arg1, TCGv_ptr arg2, tcg_target_long offset);
-void tcg_gen_and_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_or_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_xor_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_shl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_shr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_sar_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-#endif /* TCG_TARGET_REG_BITS */
-
-static inline void tcg_gen_neg_i64(TCGv_i64 ret, TCGv_i64 arg)
-{
-    if (TCG_TARGET_HAS_neg_i64) {
-        tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg);
-    } else {
-        tcg_gen_subfi_i64(ret, 0, arg);
-    }
-}
-
-/* Size changing operations.  */
-
-void tcg_gen_extu_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
-void tcg_gen_ext_i32_i64(TCGv_i64 ret, TCGv_i32 arg);
-void tcg_gen_concat_i32_i64(TCGv_i64 dest, TCGv_i32 low, TCGv_i32 high);
-void tcg_gen_extrl_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
-void tcg_gen_extrh_i64_i32(TCGv_i32 ret, TCGv_i64 arg);
-void tcg_gen_extr_i64_i32(TCGv_i32 lo, TCGv_i32 hi, TCGv_i64 arg);
-void tcg_gen_extr32_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i64 arg);
-
-void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src);
-void tcg_gen_extr_i128_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i128 arg);
-void tcg_gen_concat_i64_i128(TCGv_i128 ret, TCGv_i64 lo, TCGv_i64 hi);
-
-static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi)
-{
-    tcg_gen_deposit_i64(ret, lo, hi, 32, 32);
-}
-
-/* QEMU specific operations.  */
+#include "tcg/tcg-op-common.h"
 
 #ifndef TARGET_LONG_BITS
 #error must include QEMU headers
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1,
 # error "Unhandled number of operands to insn_start"
 #endif
 
-/**
- * tcg_gen_exit_tb() - output exit_tb TCG operation
- * @tb: The TranslationBlock from which we are exiting
- * @idx: Direct jump slot index, or exit request
- *
- * See tcg/README for more info about this TCG operation.
- * See also tcg.h and the block comment above TB_EXIT_MASK.
- *
- * For a normal exit from the TB, back to the main loop, @tb should
- * be NULL and @idx should be 0.  Otherwise, @tb should be valid and
- * @idx should be one of the TB_EXIT_ values.
- */
-void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx);
-
-/**
- * tcg_gen_goto_tb() - output goto_tb TCG operation
- * @idx: Direct jump slot index (0 or 1)
- *
- * See tcg/README for more info about this TCG operation.
- *
- * NOTE: In softmmu emulation, direct jumps with goto_tb are only safe within
- * the pages this TB resides in because we don't take care of direct jumps when
- * address mapping changes, e.g. in tlb_flush(). In user mode, there's only a
- * static address translation, so the destination address is always valid, TBs
- * are always invalidated properly, and direct jumps are reset when mapping
- * changes.
- */
-void tcg_gen_goto_tb(unsigned idx);
-
-/**
- * tcg_gen_lookup_and_goto_ptr() - look up the current TB, jump to it if valid
- * @addr: Guest address of the target TB
- *
- * If the TB is not valid, jump to the epilogue.
- *
- * This operation is optional. If the TCG backend does not implement goto_ptr,
- * this op is equivalent to calling tcg_gen_exit_tb() with 0 as the argument.
- */
-void tcg_gen_lookup_and_goto_ptr(void);
-
-static inline void tcg_gen_plugin_cb_start(unsigned from, unsigned type,
-                                           unsigned wr)
-{
-    tcg_gen_op3(INDEX_op_plugin_cb_start, from, type, wr);
-}
-
-static inline void tcg_gen_plugin_cb_end(void)
-{
-    tcg_emit_op(INDEX_op_plugin_cb_end, 0);
-}
-
 #if TARGET_LONG_BITS == 32
 typedef TCGv_i32 TCGv;
 #define tcg_temp_new() tcg_temp_new_i32()
@@ -XXX,XX +XXX,XX @@ typedef TCGv_i64 TCGv;
 #error Unhandled TARGET_LONG_BITS value
 #endif
 
-void tcg_gen_qemu_ld_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
-void tcg_gen_qemu_st_i32_chk(TCGv_i32, TCGTemp *, TCGArg, MemOp, TCGType);
-void tcg_gen_qemu_ld_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType);
-void tcg_gen_qemu_st_i64_chk(TCGv_i64, TCGTemp *, TCGArg, MemOp, TCGType);
-void tcg_gen_qemu_ld_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType);
-void tcg_gen_qemu_st_i128_chk(TCGv_i128, TCGTemp *, TCGArg, MemOp, TCGType);
-
 static inline void
 tcg_gen_qemu_ld_i32(TCGv_i32 v, TCGv a, TCGArg i, MemOp m)
 {
@@ -XXX,XX +XXX,XX @@ tcg_gen_qemu_st_i128(TCGv_i128 v, TCGv a, TCGArg i, MemOp m)
     tcg_gen_qemu_st_i128_chk(v, tcgv_tl_temp(a), i, m, TCG_TYPE_TL);
 }
 
-void tcg_gen_atomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32,
-                                    TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64,
-                                    TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128,
-                                     TCGv_i128, TCGArg, MemOp, TCGType);
-
-void tcg_gen_nonatomic_cmpxchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_nonatomic_cmpxchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_nonatomic_cmpxchg_i128_chk(TCGv_i128, TCGTemp *, TCGv_i128,
-                                        TCGv_i128, TCGArg, MemOp, TCGType);
-
-void tcg_gen_atomic_xchg_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                 TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_xchg_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                 TCGArg, MemOp, TCGType);
-
-void tcg_gen_atomic_fetch_add_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_add_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_and_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_and_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_or_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                     TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_or_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                     TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_xor_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_xor_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_smin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_smin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_umin_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_umin_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_smax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_smax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_umax_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_fetch_umax_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-
-void tcg_gen_atomic_add_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_add_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_and_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_and_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_or_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                     TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_or_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                     TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_xor_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_xor_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                      TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_smin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_smin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_umin_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_umin_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_smax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_smax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_umax_fetch_i32_chk(TCGv_i32, TCGTemp *, TCGv_i32,
-                                       TCGArg, MemOp, TCGType);
-void tcg_gen_atomic_umax_fetch_i64_chk(TCGv_i64, TCGTemp *, TCGv_i64,
-                                       TCGArg, MemOp, TCGType);
-
 #define DEF_ATOMIC2(N, S)                                               \
     static inline void N##_##S(TCGv_##S r, TCGv a, TCGv_##S v,          \
                                TCGArg i, MemOp m)                       \
@@ -XXX,XX +XXX,XX @@ DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64)
 #undef DEF_ATOMIC2
 #undef DEF_ATOMIC3
 
-void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
-void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
-void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
-void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
-void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
-void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
-void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
-void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
-void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_smin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_umin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_smax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
-
-void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-void tcg_gen_rotli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-void tcg_gen_rotri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
-
-void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
-void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
-void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
-void tcg_gen_rotls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
-
-void tcg_gen_shlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-void tcg_gen_shrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-void tcg_gen_rotlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-void tcg_gen_rotrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
-
-void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r,
-                     TCGv_vec a, TCGv_vec b);
-
-void tcg_gen_bitsel_vec(unsigned vece, TCGv_vec r, TCGv_vec a,
-                        TCGv_vec b, TCGv_vec c);
-void tcg_gen_cmpsel_vec(TCGCond cond, unsigned vece, TCGv_vec r,
-                        TCGv_vec a, TCGv_vec b, TCGv_vec c, TCGv_vec d);
-
-void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
-void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
-void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
-
 #if TARGET_LONG_BITS == 64
 #define tcg_gen_movi_tl tcg_gen_movi_i64
 #define tcg_gen_mov_tl tcg_gen_mov_i64
@@ -XXX,XX +XXX,XX @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
         : (VECE) == MO_32 ? 0x00000001ul * (uint32_t)(C)           \
         : (qemu_build_not_reached_always(), 0))                    \
      :  (target_long)dup_const(VECE, C))
-#endif
-
-#if UINTPTR_MAX == UINT32_MAX
-# define PTR  i32
-# define NAT  TCGv_i32
-#else
-# define PTR  i64
-# define NAT  TCGv_i64
-#endif
-
-static inline void tcg_gen_ld_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t o)
-{
-    glue(tcg_gen_ld_,PTR)((NAT)r, a, o);
-}
-
-static inline void tcg_gen_st_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t o)
-{
-    glue(tcg_gen_st_, PTR)((NAT)r, a, o);
-}
-
-static inline void tcg_gen_discard_ptr(TCGv_ptr a)
-{
-    glue(tcg_gen_discard_,PTR)((NAT)a);
-}
-
-static inline void tcg_gen_add_ptr(TCGv_ptr r, TCGv_ptr a, TCGv_ptr b)
-{
-    glue(tcg_gen_add_,PTR)((NAT)r, (NAT)a, (NAT)b);
-}
-
-static inline void tcg_gen_addi_ptr(TCGv_ptr r, TCGv_ptr a, intptr_t b)
-{
-    glue(tcg_gen_addi_,PTR)((NAT)r, (NAT)a, b);
-}
-
-static inline void tcg_gen_mov_ptr(TCGv_ptr d, TCGv_ptr s)
-{
-    glue(tcg_gen_mov_,PTR)((NAT)d, (NAT)s);
-}
-
-static inline void tcg_gen_movi_ptr(TCGv_ptr d, intptr_t s)
-{
-    glue(tcg_gen_movi_,PTR)((NAT)d, s);
-}
-
-static inline void tcg_gen_brcondi_ptr(TCGCond cond, TCGv_ptr a,
-                                       intptr_t b, TCGLabel *label)
-{
-    glue(tcg_gen_brcondi_,PTR)(cond, (NAT)a, b, label);
-}
-
-static inline void tcg_gen_ext_i32_ptr(TCGv_ptr r, TCGv_i32 a)
-{
-#if UINTPTR_MAX == UINT32_MAX
-    tcg_gen_mov_i32((NAT)r, a);
-#else
-    tcg_gen_ext_i32_i64((NAT)r, a);
-#endif
-}
-
-static inline void tcg_gen_trunc_i64_ptr(TCGv_ptr r, TCGv_i64 a)
-{
-#if UINTPTR_MAX == UINT32_MAX
-    tcg_gen_extrl_i64_i32((NAT)r, a);
-#else
-    tcg_gen_mov_i64((NAT)r, a);
-#endif
-}
-
-static inline void tcg_gen_extu_ptr_i64(TCGv_i64 r, TCGv_ptr a)
-{
-#if UINTPTR_MAX == UINT32_MAX
-    tcg_gen_extu_i32_i64(r, (NAT)a);
-#else
-    tcg_gen_mov_i64(r, (NAT)a);
-#endif
-}
-
-static inline void tcg_gen_trunc_ptr_i32(TCGv_i32 r, TCGv_ptr a)
-{
-#if UINTPTR_MAX == UINT32_MAX
-    tcg_gen_mov_i32(r, (NAT)a);
-#else
-    tcg_gen_extrl_i64_i32(r, (NAT)a);
-#endif
-}
-
-#undef PTR
-#undef NAT
 
+#endif /* TARGET_LONG_BITS == 64 */
 #endif /* TCG_TCG_OP_H */
diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 #include "qemu/int128.h"
-#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-common.h"
 #include "tcg-internal.h"
 
 #define CASE_OP_32_64(x)                        \
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "tcg/tcg.h"
 #include "tcg/tcg-temp-internal.h"
-#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-common.h"
 #include "tcg/tcg-op-gvec.h"
 #include "tcg/tcg-gvec-desc.h"
 
diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-ldst.c
+++ b/tcg/tcg-op-ldst.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "tcg/tcg-temp-internal.h"
-#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-common.h"
 #include "tcg/tcg-mo.h"
 #include "exec/plugin-gen.h"
 #include "tcg-internal.h"
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "tcg/tcg.h"
 #include "tcg/tcg-temp-internal.h"
-#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-common.h"
 #include "tcg/tcg-mo.h"
 #include "tcg-internal.h"
 
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "tcg/tcg-temp-internal.h"
-#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-common.h"
 #include "exec/plugin-gen.h"
 #include "tcg-internal.h"
 
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
 
 #include "exec/exec-all.h"
 #include "exec/tlb-common.h"
-#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-common.h"
 
 #if UINTPTR_MAX == UINT32_MAX
 # define ELF_CLASS  ELFCLASS32
diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "exec/cpu_ldst.h"
-#include "tcg/tcg-op.h"
+#include "tcg/tcg.h"
 #include "tcg/tcg-ldst.h"
 #include <ffi.h>
 
-- 
2.34.1

This had been included via tcg-op-common.h via tcg-op.h,
but that is going away.

It is needed for inlines within translator.h, so we might as well
do it there and not individually in each translator c file.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/translate.h     | 1 +
 target/arm/tcg/translate-a64.c | 2 --
 target/arm/tcg/translate-sme.c | 1 -
 target/arm/tcg/translate-sve.c | 2 --
 target/arm/tcg/translate.c     | 2 --
 5 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -XXX,XX +XXX,XX @@
 #define TARGET_ARM_TRANSLATE_H
 
 #include "exec/translator.h"
+#include "exec/helper-gen.h"
 #include "internals.h"
 
 
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/host-utils.h"
 #include "semihosting/semihost.h"
 #include "exec/gen-icount.h"
-#include "exec/helper-proto.h"
-#include "exec/helper-gen.h"
 #include "exec/log.h"
 #include "cpregs.h"
 #include "translate-a64.h"
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-op-gvec.h"
 #include "tcg/tcg-gvec-desc.h"
 #include "translate.h"
-#include "exec/helper-gen.h"
 #include "translate-a64.h"
 #include "fpu/softfloat.h"
 
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-sve.c
+++ b/target/arm/tcg/translate-sve.c
@@ -XXX,XX +XXX,XX @@
 #include "arm_ldst.h"
 #include "translate.h"
 #include "internals.h"
-#include "exec/helper-proto.h"
-#include "exec/helper-gen.h"
 #include "exec/log.h"
 #include "translate-a64.h"
 #include "fpu/softfloat.h"
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate.c
+++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/bitops.h"
 #include "arm_ldst.h"
 #include "semihosting/semihost.h"
-#include "exec/helper-proto.h"
-#include "exec/helper-gen.h"
 #include "exec/log.h"
 #include "cpregs.h"
 
-- 
2.34.1

This had been included via tcg-op-common.h via tcg-op.h,
but that is going away.  In idef-parser.y, shuffle some
tcg-related includes into a more logical order.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/genptr.c                  | 1 +
 target/hexagon/translate.c               | 1 +
 target/hexagon/idef-parser/idef-parser.y | 3 ++-
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/genptr.c
+++ b/target/hexagon/genptr.c
@@ -XXX,XX +XXX,XX @@
 #include "internal.h"
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
+#include "exec/helper-gen.h"
 #include "insn.h"
 #include "opcodes.h"
 #include "translate.h"
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
+#include "exec/helper-gen.h"
 #include "exec/cpu_ldst.h"
 #include "exec/log.h"
 #include "internal.h"
diff --git a/target/hexagon/idef-parser/idef-parser.y b/target/hexagon/idef-parser/idef-parser.y
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/idef-parser/idef-parser.y
+++ b/target/hexagon/idef-parser/idef-parser.y
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
     fputs("#include \"qemu/log.h\"\n", output_file);
     fputs("#include \"cpu.h\"\n", output_file);
     fputs("#include \"internal.h\"\n", output_file);
+    fputs("#include \"tcg/tcg.h\"\n", output_file);
     fputs("#include \"tcg/tcg-op.h\"\n", output_file);
+    fputs("#include \"exec/helper-gen.h\"\n", output_file);
     fputs("#include \"insn.h\"\n", output_file);
     fputs("#include \"opcodes.h\"\n", output_file);
     fputs("#include \"translate.h\"\n", output_file);
     fputs("#define QEMU_GENERATE\n", output_file);
     fputs("#include \"genptr.h\"\n", output_file);
-    fputs("#include \"tcg/tcg.h\"\n", output_file);
     fputs("#include \"macros.h\"\n", output_file);
     fprintf(output_file, "#include \"%s\"\n", argv[ARG_INDEX_EMITTER_H]);
 
-- 
2.34.1

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-head.h | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-head.h
+++ b/include/exec/helper-head.h
@@ -XXX,XX +XXX,XX @@
-/* Helper file for declaring TCG helper functions.
-   Used by other helper files.
-
-   Targets should use DEF_HELPER_N and DEF_HELPER_FLAGS_N to declare helper
-   functions.  Names should be specified without the helper_ prefix, and
-   the return and argument types specified.  3 basic types are understood
-   (i32, i64 and ptr).  Additional aliases are provided for convenience and
-   to match the types used by the C helper implementation.
-
-   The target helper.h should be included in all files that use/define
-   helper functions.  THis will ensure that function prototypes are
-   consistent.  In addition it should be included an extra two times for
-   helper.c, defining:
-    GEN_HELPER 1 to produce op generation functions (gen_helper_*)
-    GEN_HELPER 2 to do runtime registration helper functions.
+/*
+ * Helper file for declaring TCG helper functions.
+ * Used by other helper files.
  */
 
 #ifndef EXEC_HELPER_HEAD_H
-- 
2.34.1

This will be required outside of tcg-internal.h soon.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/helper-info.h | 59 +++++++++++++++++++++++++++++++++++++++
 tcg/tcg-internal.h        | 47 +------------------------------
 2 files changed, 60 insertions(+), 46 deletions(-)
 create mode 100644 include/tcg/helper-info.h

diff --git a/include/tcg/helper-info.h b/include/tcg/helper-info.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/tcg/helper-info.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * TCG Helper Information Structure
+ *
+ * Copyright (c) 2023 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef TCG_HELPER_INFO_H
+#define TCG_HELPER_INFO_H
+
+#ifdef CONFIG_TCG_INTERPRETER
+#include <ffi.h>
+#endif
+
+/*
+ * Describe the calling convention of a given argument type.
+ */
+typedef enum {
+    TCG_CALL_RET_NORMAL,         /* by registers */
+    TCG_CALL_RET_BY_REF,         /* for i128, by reference */
+    TCG_CALL_RET_BY_VEC,         /* for i128, by vector register */
+} TCGCallReturnKind;
+
+typedef enum {
+    TCG_CALL_ARG_NORMAL,         /* by registers (continuing onto stack) */
+    TCG_CALL_ARG_EVEN,           /* like normal, but skipping odd slots */
+    TCG_CALL_ARG_EXTEND,         /* for i32, as a sign/zero-extended i64 */
+    TCG_CALL_ARG_EXTEND_U,       /*      ... as a zero-extended i64 */
+    TCG_CALL_ARG_EXTEND_S,       /*      ... as a sign-extended i64 */
+    TCG_CALL_ARG_BY_REF,         /* for i128, by reference, first */
+    TCG_CALL_ARG_BY_REF_N,       /*       ... by reference, subsequent */
+} TCGCallArgumentKind;
+
+typedef struct TCGCallArgumentLoc {
+    TCGCallArgumentKind kind    : 8;
+    unsigned arg_slot           : 8;
+    unsigned ref_slot           : 8;
+    unsigned arg_idx            : 4;
+    unsigned tmp_subindex       : 2;
+} TCGCallArgumentLoc;
+
+typedef struct TCGHelperInfo {
+    void *func;
+    const char *name;
+#ifdef CONFIG_TCG_INTERPRETER
+    ffi_cif *cif;
+#endif
+    unsigned typemask           : 32;
+    unsigned flags              : 8;
+    unsigned nr_in              : 8;
+    unsigned nr_out             : 8;
+    TCGCallReturnKind out_kind  : 8;
+
+    /* Maximum physical arguments are constrained by TCG_TYPE_I128. */
+    TCGCallArgumentLoc in[MAX_CALL_IARGS * (128 / TCG_TARGET_REG_BITS)];
+} TCGHelperInfo;
+
+#endif /* TCG_HELPER_INFO_H */
diff --git a/tcg/tcg-internal.h b/tcg/tcg-internal.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-internal.h
+++ b/tcg/tcg-internal.h
@@ -XXX,XX +XXX,XX @@
 #ifndef TCG_INTERNAL_H
 #define TCG_INTERNAL_H
 
-#ifdef CONFIG_TCG_INTERPRETER
-#include <ffi.h>
-#endif
+#include "tcg/helper-info.h"
 
 #define TCG_HIGHWATER 1024
 
-/*
- * Describe the calling convention of a given argument type.
- */
-typedef enum {
-    TCG_CALL_RET_NORMAL,         /* by registers */
-    TCG_CALL_RET_BY_REF,         /* for i128, by reference */
-    TCG_CALL_RET_BY_VEC,         /* for i128, by vector register */
-} TCGCallReturnKind;
-
-typedef enum {
-    TCG_CALL_ARG_NORMAL,         /* by registers (continuing onto stack) */
-    TCG_CALL_ARG_EVEN,           /* like normal, but skipping odd slots */
-    TCG_CALL_ARG_EXTEND,         /* for i32, as a sign/zero-extended i64 */
-    TCG_CALL_ARG_EXTEND_U,       /*      ... as a zero-extended i64 */
-    TCG_CALL_ARG_EXTEND_S,       /*      ... as a sign-extended i64 */
-    TCG_CALL_ARG_BY_REF,         /* for i128, by reference, first */
-    TCG_CALL_ARG_BY_REF_N,       /*       ... by reference, subsequent */
-} TCGCallArgumentKind;
-
-typedef struct TCGCallArgumentLoc {
-    TCGCallArgumentKind kind    : 8;
-    unsigned arg_slot           : 8;
-    unsigned ref_slot           : 8;
-    unsigned arg_idx            : 4;
-    unsigned tmp_subindex       : 2;
-} TCGCallArgumentLoc;
-
-typedef struct TCGHelperInfo {
-    void *func;
-    const char *name;
-#ifdef CONFIG_TCG_INTERPRETER
-    ffi_cif *cif;
-#endif
-    unsigned typemask           : 32;
-    unsigned flags              : 8;
-    unsigned nr_in              : 8;
-    unsigned nr_out             : 8;
-    TCGCallReturnKind out_kind  : 8;
-
-    /* Maximum physical arguments are constrained by TCG_TYPE_I128. */
-    TCGCallArgumentLoc in[MAX_CALL_IARGS * (128 / TCG_TARGET_REG_BITS)];
-} TCGHelperInfo;
-
 extern TCGContext tcg_init_ctx;
 extern TCGContext **tcg_ctxs;
 extern unsigned int tcg_cur_ctxs;
-- 
2.34.1

In preparation for compiling tcg/ only once, eliminate
the all_helpers array.  Instantiate the info structs for
the generic helpers in accel/tcg/, and the structs for
the target-specific helpers in each translate.c.

Since we don't see all of the info structs at startup,
initialize at first use, using g_once_init_* to make
sure we don't race while doing so.

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: include/exec/exec-all.h
 F: include/exec/tb-flush.h
 F: include/exec/target_long.h
 F: include/exec/helper*.h
+F: include/exec/helper-info.c.inc
 F: include/sysemu/cpus.h
 F: include/sysemu/tcg.h
 F: include/hw/core/tcg-cpu-ops.h
diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-gen.h
+++ b/include/exec/helper-gen.h
@@ -XXX,XX +XXX,XX @@
-/* Helper file for declaring TCG helper functions.
-   This one expands generation functions for tcg opcodes.  */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Helper file for declaring TCG helper functions.
+ * This one expands generation functions for tcg opcodes.
+ * Define HELPER_H for the header file to be expanded,
+ * and static inline to change from global file scope.
+ */
 
 #ifndef HELPER_GEN_H
 #define HELPER_GEN_H
 
+#include "tcg/tcg.h"
+#include "tcg/helper-info.h"
 #include "exec/helper-head.h"
 
 #define DEF_HELPER_FLAGS_0(name, flags, ret)                            \
+extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl0(ret))        \
 {                                                                       \
-  tcg_gen_callN(HELPER(name), dh_retvar(ret), 0, NULL);                 \
+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 0, NULL);  \
 }
 
 #define DEF_HELPER_FLAGS_1(name, flags, ret, t1)                        \
+extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1))                                                 \
 {                                                                       \
-  TCGTemp *args[1] = { dh_arg(t1, 1) };                                 \
-  tcg_gen_callN(HELPER(name), dh_retvar(ret), 1, args);                 \
+    TCGTemp *args[1] = { dh_arg(t1, 1) };                               \
+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 1, args);  \
 }
 
 #define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2)                    \
+extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2))                             \
 {                                                                       \
-  TCGTemp *args[2] = { dh_arg(t1, 1), dh_arg(t2, 2) };                  \
-  tcg_gen_callN(HELPER(name), dh_retvar(ret), 2, args);                 \
+    TCGTemp *args[2] = { dh_arg(t1, 1), dh_arg(t2, 2) };                \
+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 2, args);  \
 }
 
 #define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3)                \
+extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3))         \
 {                                                                       \
-  TCGTemp *args[3] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3) };   \
-  tcg_gen_callN(HELPER(name), dh_retvar(ret), 3, args);                 \
+    TCGTemp *args[3] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3) }; \
+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 3, args);  \
 }
 
 #define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4)            \
+extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2),                             \
     dh_arg_decl(t3, 3), dh_arg_decl(t4, 4))                             \
 {                                                                       \
-  TCGTemp *args[4] = { dh_arg(t1, 1), dh_arg(t2, 2),                    \
-                     dh_arg(t3, 3), dh_arg(t4, 4) };                    \
-  tcg_gen_callN(HELPER(name), dh_retvar(ret), 4, args);                 \
+    TCGTemp *args[4] = { dh_arg(t1, 1), dh_arg(t2, 2),                  \
+                         dh_arg(t3, 3), dh_arg(t4, 4) };                \
+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 4, args);  \
 }
 
 #define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5)        \
+extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
-    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
+    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
     dh_arg_decl(t4, 4), dh_arg_decl(t5, 5))                             \
 {                                                                       \
-  TCGTemp *args[5] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
-                     dh_arg(t4, 4), dh_arg(t5, 5) };                    \
-  tcg_gen_callN(HELPER(name), dh_retvar(ret), 5, args);                 \
+    TCGTemp *args[5] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
+                         dh_arg(t4, 4), dh_arg(t5, 5) };                \
+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 5, args);  \
 }
 
 #define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6)    \
+extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
-    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
+    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
     dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6))         \
 {                                                                       \
-  TCGTemp *args[6] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
-                     dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6) };     \
-  tcg_gen_callN(HELPER(name), dh_retvar(ret), 6, args);                 \
+    TCGTemp *args[6] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
+                         dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6) }; \
+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 6, args);  \
 }
 
 #define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
+extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
-    dh_arg_decl(t1, 1),  dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),        \
+    dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
     dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
     dh_arg_decl(t7, 7))                                                 \
 {                                                                       \
-  TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),     \
-                     dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),       \
-                     dh_arg(t7, 7) };                                   \
-  tcg_gen_callN(HELPER(name), dh_retvar(ret), 7, args);                 \
+    TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
+                         dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),   \
+                         dh_arg(t7, 7) };                               \
+    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 7, args);  \
 }
 
 #include "helper.h"
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
 #undef DEF_HELPER_FLAGS_5
 #undef DEF_HELPER_FLAGS_6
 #undef DEF_HELPER_FLAGS_7
-#undef GEN_HELPER
 
 #endif /* HELPER_GEN_H */
diff --git a/include/exec/helper-tcg.h b/include/exec/helper-tcg.h
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/include/exec/helper-tcg.h
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-/* Helper file for declaring TCG helper functions.
-   This one defines data structures private to tcg.c.  */
-
-#ifndef HELPER_TCG_H
-#define HELPER_TCG_H
-
-#include "exec/helper-head.h"
-
-/* Need one more level of indirection before stringification
-   to get all the macros expanded first.  */
-#define str(s) #s
-
-#define DEF_HELPER_FLAGS_0(NAME, FLAGS, ret) \
-  { .func = HELPER(NAME), .name = str(NAME), \
-    .flags = FLAGS | dh_callflag(ret), \
-    .typemask = dh_typemask(ret, 0) },
-
-#define DEF_HELPER_FLAGS_1(NAME, FLAGS, ret, t1) \
-  { .func = HELPER(NAME), .name = str(NAME), \
-    .flags = FLAGS | dh_callflag(ret), \
-    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) },
-
-#define DEF_HELPER_FLAGS_2(NAME, FLAGS, ret, t1, t2) \
-  { .func = HELPER(NAME), .name = str(NAME), \
-    .flags = FLAGS | dh_callflag(ret), \
-    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
-    | dh_typemask(t2, 2) },
-
-#define DEF_HELPER_FLAGS_3(NAME, FLAGS, ret, t1, t2, t3) \
-  { .func = HELPER(NAME), .name = str(NAME), \
-    .flags = FLAGS | dh_callflag(ret), \
-    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
-    | dh_typemask(t2, 2) | dh_typemask(t3, 3) },
-
-#define DEF_HELPER_FLAGS_4(NAME, FLAGS, ret, t1, t2, t3, t4) \
-  { .func = HELPER(NAME), .name = str(NAME), \
-    .flags = FLAGS | dh_callflag(ret), \
-    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
-    | dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) },
-
-#define DEF_HELPER_FLAGS_5(NAME, FLAGS, ret, t1, t2, t3, t4, t5) \
-  { .func = HELPER(NAME), .name = str(NAME), \
-    .flags = FLAGS | dh_callflag(ret), \
-    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
-    | dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
-    | dh_typemask(t5, 5) },
-
-#define DEF_HELPER_FLAGS_6(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6) \
-  { .func = HELPER(NAME), .name = str(NAME), \
-    .flags = FLAGS | dh_callflag(ret), \
-    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
-    | dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
-    | dh_typemask(t5, 5) | dh_typemask(t6, 6) },
-
-#define DEF_HELPER_FLAGS_7(NAME, FLAGS, ret, t1, t2, t3, t4, t5, t6, t7) \
-  { .func = HELPER(NAME), .name = str(NAME), .flags = FLAGS, \
-    .typemask = dh_typemask(ret, 0) | dh_typemask(t1, 1) \
-    | dh_typemask(t2, 2) | dh_typemask(t3, 3) | dh_typemask(t4, 4) \
-    | dh_typemask(t5, 5) | dh_typemask(t6, 6) | dh_typemask(t7, 7) },
-
-#include "helper.h"
-#include "accel/tcg/tcg-runtime.h"
-#include "accel/tcg/plugin-helpers.h"
-
-#undef str
-#undef DEF_HELPER_FLAGS_0
-#undef DEF_HELPER_FLAGS_1
-#undef DEF_HELPER_FLAGS_2
-#undef DEF_HELPER_FLAGS_3
-#undef DEF_HELPER_FLAGS_4
-#undef DEF_HELPER_FLAGS_5
-#undef DEF_HELPER_FLAGS_6
-#undef DEF_HELPER_FLAGS_7
-
-#endif /* HELPER_TCG_H */
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -XXX,XX +XXX,XX @@ typedef struct ReservedRegion ReservedRegion;
 typedef struct SavedIOTLB SavedIOTLB;
 typedef struct SHPCDevice SHPCDevice;
 typedef struct SSIBus SSIBus;
+typedef struct TCGHelperInfo TCGHelperInfo;
 typedef struct TranslationBlock TranslationBlock;
 typedef struct VirtIODevice VirtIODevice;
 typedef struct Visitor Visitor;
diff --git a/include/tcg/helper-info.h b/include/tcg/helper-info.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/helper-info.h
+++ b/include/tcg/helper-info.h
@@ -XXX,XX +XXX,XX @@ typedef struct TCGCallArgumentLoc {
     unsigned tmp_subindex       : 2;
 } TCGCallArgumentLoc;
 
-typedef struct TCGHelperInfo {
+struct TCGHelperInfo {
     void *func;
     const char *name;
+
+    /* Used with g_once_init_enter. */
 #ifdef CONFIG_TCG_INTERPRETER
     ffi_cif *cif;
+#else
+    uintptr_t init;
 #endif
+
     unsigned typemask           : 32;
     unsigned flags              : 8;
     unsigned nr_in              : 8;
@@ -XXX,XX +XXX,XX @@ typedef struct TCGHelperInfo {
 
     /* Maximum physical arguments are constrained by TCG_TYPE_I128. */
     TCGCallArgumentLoc in[MAX_CALL_IARGS * (128 / TCG_TARGET_REG_BITS)];
-} TCGHelperInfo;
+};
 
 #endif /* TCG_HELPER_INFO_H */
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef struct TCGTargetOpDef {
 
 bool tcg_op_supported(TCGOpcode op);
 
-void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args);
+void tcg_gen_callN(TCGHelperInfo *, TCGTemp *ret, int nargs, TCGTemp **args);
 
 TCGOp *tcg_emit_op(TCGOpcode opc, unsigned nargs);
 void tcg_op_remove(TCGContext *s, TCGOp *op);
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "exec/plugin-gen.h"
 #include "exec/translator.h"
+#include "exec/helper-proto.h"
+
+#define HELPER_H  "accel/tcg/plugin-helpers.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
 
 #ifdef CONFIG_SOFTMMU
 # define CONFIG_SOFTMMU_GATE 1
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/log.h"
 #include "tcg/tcg.h"
 
+#define HELPER_H  "accel/tcg/tcg-runtime.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 /* 32-bit helpers */
 
 int32_t HELPER(div_i32)(int32_t arg1, int32_t arg2)
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/translator.h"
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
 
 #undef ALPHA_DEBUG_DISAS
 #define CONFIG_SOFTFLOAT_INLINE
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate.c
+++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/log.h"
 #include "cpregs.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
 
 #define ENABLE_ARCH_4T    arm_dc_feature(s, ARM_FEATURE_V4T)
 #define ENABLE_ARCH_5     arm_dc_feature(s, ARM_FEATURE_V5)
diff --git a/target/avr/translate.c b/target/avr/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/translate.c
+++ b/target/avr/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/translator.h"
 #include "exec/gen-icount.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
+
 /*
  *  Define if you want a BREAK instruction translated to a breakpoint
  *  Active debugging connection is assumed
diff --git a/target/cris/translate.c b/target/cris/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/translate.c
+++ b/target/cris/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/translator.h"
 #include "crisv32-decode.h"
 #include "qemu/qemu-print.h"
-
 #include "exec/helper-gen.h"
-
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 
 #define DISAS_CRIS 0
 #if DISAS_CRIS
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "genptr.h"
 #include "printinsn.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 #include "analyze_funcs_generated.c.inc"
 
 typedef void (*AnalyzeInsn)(DisasContext *ctx);
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/translator.h"
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
+
 /* Since we have a distinction between register size and address size,
    we need to redefine all of these.  */
 
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
+
 #define PREFIX_REPZ   0x01
 #define PREFIX_REPNZ  0x02
 #define PREFIX_LOCK   0x04
diff --git a/target/loongarch/translate.c b/target/loongarch/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/translate.c
+++ b/target/loongarch/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_lladdr, cpu_llval;
 
 #include "exec/gen-icount.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 #define DISAS_STOP        DISAS_TARGET_0
 #define DISAS_EXIT        DISAS_TARGET_1
 #define DISAS_EXIT_UPDATE DISAS_TARGET_2
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/translate.c
+++ b/target/m68k/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/log.h"
 #include "fpu/softfloat.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
 
 //#define DEBUG_DISPATCH 1
 
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -XXX,XX +XXX,XX @@
 
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 #define EXTRACT_FIELD(src, start, end) \
             (((src) >> start) & ((1 << (end - start + 1)) - 1))
 
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "fpu_helper.h"
 #include "translate.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
+
 /*
  * Many sysemu-only helpers are not reachable for user-only.
  * Define stub generators here, so that we need not either sprinkle
diff --git a/target/nios2/translate.c b/target/nios2/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/nios2/translate.c
+++ b/target/nios2/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/gen-icount.h"
 #include "semihosting/semihost.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
+
 /* is_jmp field values */
 #define DISAS_UPDATE  DISAS_TARGET_1 /* cpu state was modified dynamically */
 
diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/translate.c
+++ b/target/openrisc/translate.c
@@ -XXX,XX +XXX,XX @@
 
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
+
 /* is_jmp field values */
 #define DISAS_EXIT    DISAS_TARGET_0  /* force exit to main loop */
 #define DISAS_JUMP    DISAS_TARGET_1  /* exit via jmp_pc/jmp_pc_imm */
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/qemu-print.h"
 #include "qapi/error.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 #define CPU_SINGLE_STEP 0x1
 #define CPU_BRANCH_STEP 0x2
 
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "instmap.h"
 #include "internals.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 /* global register indices */
 static TCGv cpu_gpr[32], cpu_gprh[32], cpu_pc, cpu_vl, cpu_vstart;
 static TCGv_i64 cpu_fpr[32]; /* assume F and D extensions */
diff --git a/target/rx/translate.c b/target/rx/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/translate.c
+++ b/target/rx/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/translator.h"
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
+
 typedef struct DisasContext {
     DisasContextBase base;
     CPURXState *env;
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/log.h"
 #include "qemu/atomic128.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 
 /* Information that (most) every instruction needs to manipulate.  */
 typedef struct DisasContext DisasContext;
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/log.h"
 #include "qemu/qemu-print.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 
 typedef struct DisasContext {
     DisasContextBase base;
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/log.h"
 #include "asi.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
 
 #define DYNAMIC_PC  1 /* dynamic pc value */
 #define JUMP_PC     2 /* dynamic pc value which takes only two values
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/translate.c
+++ b/target/tricore/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/translator.h"
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
+
 /*
  * TCG registers
  */
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -XXX,XX +XXX,XX @@
 
 #include "exec/log.h"
 
+#define HELPER_H "helper.h"
+#include "exec/helper-info.c.inc"
+#undef  HELPER_H
+
 
 struct DisasContext {
     DisasContextBase base;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_pool_reset(TCGContext *s)
     s->pool_current = NULL;
 }
 
-#include "exec/helper-proto.h"
-
-static TCGHelperInfo all_helpers[] = {
-#include "exec/helper-tcg.h"
-};
-static GHashTable *helper_table;
-
 /*
  * Create TCGHelperInfo structures for "tcg/tcg-ldst.h" functions,
- * akin to what "exec/helper-tcg.h" does with DEF_HELPER_FLAGS_N.
+ * akin to what "exec/helper-info.c.inc" does with DEF_HELPER_FLAGS_N.
@@ -XXX,XX +XXX,XX @@ static ffi_type *typecode_to_ffi(int argmask)
     g_assert_not_reached();
 }
 
-static void init_ffi_layouts(void)
+static ffi_cif *init_ffi_layout(TCGHelperInfo *info)
 {
-    /* g_direct_hash/equal for direct comparisons on uint32_t.  */
-    GHashTable *ffi_table = g_hash_table_new(NULL, NULL);
+    unsigned typemask = info->typemask;
+    struct {
+        ffi_cif cif;
+        ffi_type *args[];
+    } *ca;
+    ffi_status status;
+    int nargs;
 
-    for (int i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
-        TCGHelperInfo *info = &all_helpers[i];
-        unsigned typemask = info->typemask;
-        gpointer hash = (gpointer)(uintptr_t)typemask;
-        struct {
-            ffi_cif cif;
-            ffi_type *args[];
-        } *ca;
-        ffi_status status;
-        int nargs;
-        ffi_cif *cif;
+    /* Ignoring the return type, find the last non-zero field. */
+    nargs = 32 - clz32(typemask >> 3);
+    nargs = DIV_ROUND_UP(nargs, 3);
+    assert(nargs <= MAX_CALL_IARGS);
 
-        cif = g_hash_table_lookup(ffi_table, hash);
-        if (cif) {
-            info->cif = cif;
-            continue;
+    ca = g_malloc0(sizeof(*ca) + nargs * sizeof(ffi_type *));
+    ca->cif.rtype = typecode_to_ffi(typemask & 7);
+    ca->cif.nargs = nargs;
+
+    if (nargs != 0) {
+        ca->cif.arg_types = ca->args;
+        for (int j = 0; j < nargs; ++j) {
+            int typecode = extract32(typemask, (j + 1) * 3, 3);
+            ca->args[j] = typecode_to_ffi(typecode);
         }
-
-        /* Ignoring the return type, find the last non-zero field. */
-        nargs = 32 - clz32(typemask >> 3);
-        nargs = DIV_ROUND_UP(nargs, 3);
-        assert(nargs <= MAX_CALL_IARGS);
-
-        ca = g_malloc0(sizeof(*ca) + nargs * sizeof(ffi_type *));
-        ca->cif.rtype = typecode_to_ffi(typemask & 7);
-        ca->cif.nargs = nargs;
-
-        if (nargs != 0) {
-            ca->cif.arg_types = ca->args;
-            for (int j = 0; j < nargs; ++j) {
-                int typecode = extract32(typemask, (j + 1) * 3, 3);
-                ca->args[j] = typecode_to_ffi(typecode);
-            }
-        }
-
-        status = ffi_prep_cif(&ca->cif, FFI_DEFAULT_ABI, nargs,
-                              ca->cif.rtype, ca->cif.arg_types);
-        assert(status == FFI_OK);
-
-        cif = &ca->cif;
-        info->cif = cif;
-        g_hash_table_insert(ffi_table, hash, (gpointer)cif);
     }
 
-    g_hash_table_destroy(ffi_table);
+    status = ffi_prep_cif(&ca->cif, FFI_DEFAULT_ABI, nargs,
+                          ca->cif.rtype, ca->cif.arg_types);
+    assert(status == FFI_OK);
+
+    return &ca->cif;
 }
+
+#define HELPER_INFO_INIT(I)      (&(I)->cif)
+#define HELPER_INFO_INIT_VAL(I)  init_ffi_layout(I)
+#else
+#define HELPER_INFO_INIT(I)      (&(I)->init)
+#define HELPER_INFO_INIT_VAL(I)  1
 #endif /* CONFIG_TCG_INTERPRETER */
 
 static inline bool arg_slot_reg_p(unsigned arg_slot)
@@ -XXX,XX +XXX,XX @@ static void tcg_context_init(unsigned max_cpus)
         args_ct += n;
     }
 
-    /* Register helpers.  */
-    /* Use g_direct_hash/equal for direct pointer comparisons on func.  */
-    helper_table = g_hash_table_new(NULL, NULL);
-
-    for (i = 0; i < ARRAY_SIZE(all_helpers); ++i) {
-        init_call_layout(&all_helpers[i]);
-        g_hash_table_insert(helper_table, (gpointer)all_helpers[i].func,
-                            (gpointer)&all_helpers[i]);
-    }
-
     init_call_layout(&info_helper_ld32_mmu);
     init_call_layout(&info_helper_ld64_mmu);
     init_call_layout(&info_helper_ld128_mmu);
@@ -XXX,XX +XXX,XX @@ static void tcg_context_init(unsigned max_cpus)
     init_call_layout(&info_helper_st64_mmu);
     init_call_layout(&info_helper_st128_mmu);
 
-#ifdef CONFIG_TCG_INTERPRETER
-    init_ffi_layouts();
-#endif
-
     tcg_target_init(s);
     process_op_defs(s);
 
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
 
 static TCGOp *tcg_op_alloc(TCGOpcode opc, unsigned nargs);
 
-void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
+void tcg_gen_callN(TCGHelperInfo *info, TCGTemp *ret, int nargs, TCGTemp **args)
 {
-    const TCGHelperInfo *info;
     TCGv_i64 extend_free[MAX_CALL_IARGS];
     int n_extend = 0;
     TCGOp *op;
     int i, n, pi = 0, total_args;
 
-    info = g_hash_table_lookup(helper_table, (gpointer)func);
+    if (unlikely(g_once_init_enter(HELPER_INFO_INIT(info)))) {
+        init_call_layout(info);
+        g_once_init_leave(HELPER_INFO_INIT(info), HELPER_INFO_INIT_VAL(info));
+    }
+
     total_args = info->nr_out + info->nr_in + 2;
     op = tcg_op_alloc(INDEX_op_call, total_args);
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
             g_assert_not_reached();
         }
     }
-    op->args[pi++] = (uintptr_t)func;
+    op->args[pi++] = (uintptr_t)info->func;
     op->args[pi++] = (uintptr_t)info;
     tcg_debug_assert(pi == total_args);
 
diff --git a/include/exec/helper-info.c.inc b/include/exec/helper-info.c.inc
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/exec/helper-info.c.inc
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Helper file for declaring TCG helper functions.
+ * This one expands info structures for tcg helpers.
+ * Define HELPER_H for the header file to be expanded.
+ */
+
+#include "tcg/tcg.h"
+#include "tcg/helper-info.h"
+#include "exec/helper-head.h"
+
+/*
+ * Need one more level of indirection before stringification
+ * to get all the macros expanded first.
+ */
+#define str(s) #s
+
+#define DEF_HELPER_FLAGS_0(NAME, FLAGS, RET)                            \
+    TCGHelperInfo glue(helper_info_, NAME) = {                          \
+        .func = HELPER(NAME), .name = str(NAME),                        \
+        .flags = FLAGS | dh_callflag(RET),                              \
+        .typemask = dh_typemask(RET, 0)                                 \
+    };
+
+#define DEF_HELPER_FLAGS_1(NAME, FLAGS, RET, T1)                        \
+    TCGHelperInfo glue(helper_info_, NAME) = {                          \
+        .func = HELPER(NAME), .name = str(NAME),                        \
+        .flags = FLAGS | dh_callflag(RET),                              \
+        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
+    };
+
+#define DEF_HELPER_FLAGS_2(NAME, FLAGS, RET, T1, T2)                    \
+    TCGHelperInfo glue(helper_info_, NAME) = {                          \
+        .func = HELPER(NAME), .name = str(NAME),                        \
+        .flags = FLAGS | dh_callflag(RET),                              \
+        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
+                  | dh_typemask(T2, 2)                                  \
+    };
+
+#define DEF_HELPER_FLAGS_3(NAME, FLAGS, RET, T1, T2, T3)                \
+    TCGHelperInfo glue(helper_info_, NAME) = {                          \
+        .func = HELPER(NAME), .name = str(NAME),                        \
+        .flags = FLAGS | dh_callflag(RET),                              \
+        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
+                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
+    };
+
+#define DEF_HELPER_FLAGS_4(NAME, FLAGS, RET, T1, T2, T3, T4)            \
+    TCGHelperInfo glue(helper_info_, NAME) = {                          \
+        .func = HELPER(NAME), .name = str(NAME),                        \
+        .flags = FLAGS | dh_callflag(RET),                              \
+        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
+                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
+                  | dh_typemask(T4, 4)                                  \
+    };
+
+#define DEF_HELPER_FLAGS_5(NAME, FLAGS, RET, T1, T2, T3, T4, T5)        \
+    TCGHelperInfo glue(helper_info_, NAME) = {                          \
+        .func = HELPER(NAME), .name = str(NAME),                        \
+        .flags = FLAGS | dh_callflag(RET),                              \
+        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
+                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
+                  | dh_typemask(T4, 4) | dh_typemask(T5, 5)             \
+    };
+
+#define DEF_HELPER_FLAGS_6(NAME, FLAGS, RET, T1, T2, T3, T4, T5, T6)    \
+    TCGHelperInfo glue(helper_info_, NAME) = {                          \
+        .func = HELPER(NAME), .name = str(NAME),                        \
+        .flags = FLAGS | dh_callflag(RET),                              \
+        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
+                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
+                  | dh_typemask(T4, 4) | dh_typemask(T5, 5)             \
+                  | dh_typemask(T6, 6)                                  \
+    };
+
+#define DEF_HELPER_FLAGS_7(NAME, FLAGS, RET, T1, T2, T3, T4, T5, T6, T7) \
+    TCGHelperInfo glue(helper_info_, NAME) = {                          \
+        .func = HELPER(NAME), .name = str(NAME),                        \
+        .flags = FLAGS | dh_callflag(RET),                              \
+        .typemask = dh_typemask(RET, 0) | dh_typemask(T1, 1)            \
+                  | dh_typemask(T2, 2) | dh_typemask(T3, 3)             \
+                  | dh_typemask(T4, 4) | dh_typemask(T5, 5)             \
+                  | dh_typemask(T6, 6) | dh_typemask(T7, 7)             \
+    };
+
+#include HELPER_H
+
+#undef str
+#undef DEF_HELPER_FLAGS_0
+#undef DEF_HELPER_FLAGS_1
+#undef DEF_HELPER_FLAGS_2
+#undef DEF_HELPER_FLAGS_3
+#undef DEF_HELPER_FLAGS_4
+#undef DEF_HELPER_FLAGS_5
+#undef DEF_HELPER_FLAGS_6
+#undef DEF_HELPER_FLAGS_7
-- 
2.34.1

Removes a multiplicity of calls to __assert_fail, saving up
to 360kiB of .text space as measured on an x86_64 host.

Old     New     Less    %Change
9257272	8888680	368592	3.98%	qemu-system-aarch64
6100968	5911832	189136	3.10%	qemu-system-riscv64
5839112	5707032	132080	2.26%	qemu-system-mips
4447608	4341752	105856	2.38%	qemu-system-s390x

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h | 30 ++++++++++++++++--------------
 tcg/tcg.c         | 19 +++++++++++++++++++
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ static inline void *tcg_splitwx_to_rw(const void *rx)
 }
 #endif
 
-static inline size_t temp_idx(TCGTemp *ts)
-{
-    ptrdiff_t n = ts - tcg_ctx->temps;
-    tcg_debug_assert(n >= 0 && n < tcg_ctx->nb_temps);
-    return n;
-}
-
 static inline TCGArg temp_arg(TCGTemp *ts)
 {
     return (uintptr_t)ts;
@@ -XXX,XX +XXX,XX @@ static inline TCGTemp *arg_temp(TCGArg a)
     return (TCGTemp *)(uintptr_t)a;
 }
 
-/* Using the offset of a temporary, relative to TCGContext, rather than
-   its index means that we don't use 0.  That leaves offset 0 free for
-   a NULL representation without having to leave index 0 unused.  */
+#ifdef CONFIG_DEBUG_TCG
+size_t temp_idx(TCGTemp *ts);
+TCGTemp *tcgv_i32_temp(TCGv_i32 v);
+#else
+static inline size_t temp_idx(TCGTemp *ts)
+{
+    return ts - tcg_ctx->temps;
+}
+
+/*
+ * Using the offset of a temporary, relative to TCGContext, rather than
+ * its index means that we don't use 0.  That leaves offset 0 free for
+ * a NULL representation without having to leave index 0 unused.
+ */
 static inline TCGTemp *tcgv_i32_temp(TCGv_i32 v)
 {
-    uintptr_t o = (uintptr_t)v;
-    TCGTemp *t = (void *)tcg_ctx + o;
-    tcg_debug_assert(offsetof(TCGContext, temps[temp_idx(t)]) == o);
-    return t;
+    return (void *)tcg_ctx + (uintptr_t)v;
 }
+#endif
 
 static inline TCGTemp *tcgv_i64_temp(TCGv_i64 v)
 {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_constant_vec_matching(TCGv_vec match, unsigned vece, int64_t val)
     return tcg_constant_vec(t->base_type, vece, val);
 }
 
+#ifdef CONFIG_DEBUG_TCG
+size_t temp_idx(TCGTemp *ts)
+{
+    ptrdiff_t n = ts - tcg_ctx->temps;
+    assert(n >= 0 && n < tcg_ctx->nb_temps);
+    return n;
+}
+
+TCGTemp *tcgv_i32_temp(TCGv_i32 v)
+{
+    uintptr_t o = (uintptr_t)v - offsetof(TCGContext, temps);
+
+    assert(o < sizeof(TCGTemp) * tcg_ctx->nb_temps);
+    assert(o % sizeof(TCGTemp) == 0);
+
+    return (void *)tcg_ctx + (uintptr_t)v;
+}
+#endif /* CONFIG_DEBUG_TCG */
+
 /* Return true if OP may appear in the opcode stream.
    Test the runtime variable that controls each opcode.  */
 bool tcg_op_supported(TCGOpcode op)
-- 
2.34.1

Make tcg_gen_callN a static function.  Create tcg_gen_call[0-7]
functions for use by helper-gen.h.inc.

Removes a multiplicity of calls to __stack_chk_fail, saving up
to 143kiB of .text space as measured on an x86_64 host.

Old     New     Less    %Change
8888680	8741816	146864	1.65%	qemu-system-aarch64
5911832	5856152	55680	0.94%	qemu-system-riscv64
5816728	5767512	49216	0.85%	qemu-system-mips64
6707832	6659144	48688	0.73%	qemu-system-ppc64

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-gen.h | 40 ++++++++++++++---------------
 include/tcg/tcg.h         | 14 +++++++++-
 tcg/tcg.c                 | 54 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/include/exec/helper-gen.h b/include/exec/helper-gen.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-gen.h
+++ b/include/exec/helper-gen.h
@@ -XXX,XX +XXX,XX @@
 extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl0(ret))        \
 {                                                                       \
-    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 0, NULL);  \
+    tcg_gen_call0(&glue(helper_info_, name), dh_retvar(ret));           \
 }
 
 #define DEF_HELPER_FLAGS_1(name, flags, ret, t1)                        \
@@ -XXX,XX +XXX,XX @@ extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1))                                                 \
 {                                                                       \
-    TCGTemp *args[1] = { dh_arg(t1, 1) };                               \
-    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 1, args);  \
+    tcg_gen_call1(&glue(helper_info_, name), dh_retvar(ret),            \
+                  dh_arg(t1, 1));                                       \
 }
 
 #define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2)                    \
@@ -XXX,XX +XXX,XX @@ extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2))                             \
 {                                                                       \
-    TCGTemp *args[2] = { dh_arg(t1, 1), dh_arg(t2, 2) };                \
-    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 2, args);  \
+    tcg_gen_call2(&glue(helper_info_, name), dh_retvar(ret),            \
+                  dh_arg(t1, 1), dh_arg(t2, 2));                        \
 }
 
 #define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3)                \
@@ -XXX,XX +XXX,XX @@ extern TCGHelperInfo glue(helper_info_, name);                          \
 static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3))         \
 {                                                                       \
-    TCGTemp *args[3] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3) }; \
-    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 3, args);  \
+    tcg_gen_call3(&glue(helper_info_, name), dh_retvar(ret),            \
+                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3));         \
 }
 
 #define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4)            \
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2),                             \
     dh_arg_decl(t3, 3), dh_arg_decl(t4, 4))                             \
 {                                                                       \
-    TCGTemp *args[4] = { dh_arg(t1, 1), dh_arg(t2, 2),                  \
-                         dh_arg(t3, 3), dh_arg(t4, 4) };                \
-    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 4, args);  \
+    tcg_gen_call4(&glue(helper_info_, name), dh_retvar(ret),            \
+                  dh_arg(t1, 1), dh_arg(t2, 2),                         \
+                  dh_arg(t3, 3), dh_arg(t4, 4));                        \
 }
 
 #define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5)        \
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
     dh_arg_decl(t4, 4), dh_arg_decl(t5, 5))                             \
 {                                                                       \
-    TCGTemp *args[5] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
-                         dh_arg(t4, 4), dh_arg(t5, 5) };                \
-    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 5, args);  \
+    tcg_gen_call5(&glue(helper_info_, name), dh_retvar(ret),            \
+                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
+                  dh_arg(t4, 4), dh_arg(t5, 5));                        \
 }
 
 #define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6)    \
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t1, 1), dh_arg_decl(t2, 2), dh_arg_decl(t3, 3),         \
     dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6))         \
 {                                                                       \
-    TCGTemp *args[6] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
-                         dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6) }; \
-    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 6, args);  \
+    tcg_gen_call6(&glue(helper_info_, name), dh_retvar(ret),            \
+                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
+                  dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6));         \
 }
 
 #define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7)\
@@ -XXX,XX +XXX,XX @@ static inline void glue(gen_helper_, name)(dh_retvar_decl(ret)          \
     dh_arg_decl(t4, 4), dh_arg_decl(t5, 5), dh_arg_decl(t6, 6),         \
     dh_arg_decl(t7, 7))                                                 \
 {                                                                       \
-    TCGTemp *args[7] = { dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),   \
-                         dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),   \
-                         dh_arg(t7, 7) };                               \
-    tcg_gen_callN(&glue(helper_info_, name), dh_retvar(ret), 7, args);  \
+    tcg_gen_call7(&glue(helper_info_, name), dh_retvar(ret),            \
+                  dh_arg(t1, 1), dh_arg(t2, 2), dh_arg(t3, 3),          \
+                  dh_arg(t4, 4), dh_arg(t5, 5), dh_arg(t6, 6),          \
+                  dh_arg(t7, 7));                                       \
 }
 
 #include "helper.h"
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef struct TCGTargetOpDef {
 
 bool tcg_op_supported(TCGOpcode op);
 
-void tcg_gen_callN(TCGHelperInfo *, TCGTemp *ret, int nargs, TCGTemp **args);
+void tcg_gen_call0(TCGHelperInfo *, TCGTemp *ret);
+void tcg_gen_call1(TCGHelperInfo *, TCGTemp *ret, TCGTemp *);
+void tcg_gen_call2(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *);
+void tcg_gen_call3(TCGHelperInfo *, TCGTemp *ret, TCGTemp *,
+                   TCGTemp *, TCGTemp *);
+void tcg_gen_call4(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *,
+                   TCGTemp *, TCGTemp *);
+void tcg_gen_call5(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *,
+                   TCGTemp *, TCGTemp *, TCGTemp *);
+void tcg_gen_call6(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *,
+                   TCGTemp *, TCGTemp *, TCGTemp *, TCGTemp *);
+void tcg_gen_call7(TCGHelperInfo *, TCGTemp *ret, TCGTemp *, TCGTemp *,
+                   TCGTemp *, TCGTemp *, TCGTemp *, TCGTemp *, TCGTemp *);
 
 TCGOp *tcg_emit_op(TCGOpcode opc, unsigned nargs);
 void tcg_op_remove(TCGContext *s, TCGOp *op);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
 
 static TCGOp *tcg_op_alloc(TCGOpcode opc, unsigned nargs);
 
-void tcg_gen_callN(TCGHelperInfo *info, TCGTemp *ret, int nargs, TCGTemp **args)
+static void tcg_gen_callN(TCGHelperInfo *info, TCGTemp *ret, TCGTemp **args)
 {
     TCGv_i64 extend_free[MAX_CALL_IARGS];
     int n_extend = 0;
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(TCGHelperInfo *info, TCGTemp *ret, int nargs, TCGTemp **args)
     }
 }
 
+void tcg_gen_call0(TCGHelperInfo *info, TCGTemp *ret)
+{
+    tcg_gen_callN(info, ret, NULL);
+}
+
+void tcg_gen_call1(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1)
+{
+    tcg_gen_callN(info, ret, &t1);
+}
+
+void tcg_gen_call2(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1, TCGTemp *t2)
+{
+    TCGTemp *args[2] = { t1, t2 };
+    tcg_gen_callN(info, ret, args);
+}
+
+void tcg_gen_call3(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1,
+                   TCGTemp *t2, TCGTemp *t3)
+{
+    TCGTemp *args[3] = { t1, t2, t3 };
+    tcg_gen_callN(info, ret, args);
+}
+
+void tcg_gen_call4(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1,
+                   TCGTemp *t2, TCGTemp *t3, TCGTemp *t4)
+{
+    TCGTemp *args[4] = { t1, t2, t3, t4 };
+    tcg_gen_callN(info, ret, args);
+}
+
+void tcg_gen_call5(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1,
+                   TCGTemp *t2, TCGTemp *t3, TCGTemp *t4, TCGTemp *t5)
+{
+    TCGTemp *args[5] = { t1, t2, t3, t4, t5 };
+    tcg_gen_callN(info, ret, args);
+}
+
+void tcg_gen_call6(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1, TCGTemp *t2,
+                   TCGTemp *t3, TCGTemp *t4, TCGTemp *t5, TCGTemp *t6)
+{
+    TCGTemp *args[6] = { t1, t2, t3, t4, t5, t6 };
+    tcg_gen_callN(info, ret, args);
+}
+
+void tcg_gen_call7(TCGHelperInfo *info, TCGTemp *ret, TCGTemp *t1,
+                   TCGTemp *t2, TCGTemp *t3, TCGTemp *t4,
+                   TCGTemp *t5, TCGTemp *t6, TCGTemp *t7)
+{
+    TCGTemp *args[7] = { t1, t2, t3, t4, t5, t6, t7 };
+    tcg_gen_callN(info, ret, args);
+}
+
 static void tcg_reg_alloc_start(TCGContext *s)
 {
     int i, n;
-- 
2.34.1

Create helper-gen-common.h without the target specific portion.
Use that in tcg-op-common.h.  Reorg headers in target/arm to
ensure that helper-gen.h is included before helper-info.c.inc.
All other targets are already correct in this regard.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 MAINTAINERS                      |   1 +
 include/exec/helper-gen-common.h |  18 ++++++
 include/exec/helper-gen.h        | 101 ++----------------------------
 include/tcg/tcg-op-common.h      |   2 +-
 include/exec/helper-gen.h.inc    | 102 +++++++++++++++++++++++++++++++
 target/arm/tcg/translate.c       |   8 +--
 6 files changed, 129 insertions(+), 103 deletions(-)
 create mode 100644 include/exec/helper-gen-common.h
 create mode 100644 include/exec/helper-gen.h.inc

Create helper-proto-common.h without the target specific portion.
Use that in tcg-op-common.h.  Include helper-proto.h in target/arm
and target/hexagon before helper-info.c.inc; all other targets are
already correct in this regard.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-proto-common.h | 18 ++++++++
 include/exec/helper-proto.h        | 73 ++++--------------------------
 include/tcg/tcg-op-common.h        |  2 +-
 include/exec/helper-proto.h.inc    | 68 ++++++++++++++++++++++++++++
 accel/tcg/cputlb.c                 |  3 +-
 accel/tcg/plugin-gen.c             |  2 +-
 accel/tcg/tcg-runtime-gvec.c       |  2 +-
 accel/tcg/tcg-runtime.c            |  2 +-
 target/arm/tcg/translate.c         |  1 +
 target/hexagon/translate.c         |  1 +
 10 files changed, 102 insertions(+), 70 deletions(-)
 create mode 100644 include/exec/helper-proto-common.h
 create mode 100644 include/exec/helper-proto.h.inc

diff --git a/include/exec/helper-proto-common.h b/include/exec/helper-proto-common.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/exec/helper-proto-common.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Helper file for declaring TCG helper functions.
+ * This one expands prototypes for the helper functions.
+ */
+
+#ifndef HELPER_PROTO_COMMON_H
+#define HELPER_PROTO_COMMON_H
+
+#define HELPER_H "accel/tcg/tcg-runtime.h"
+#include "exec/helper-proto.h.inc"
+#undef  HELPER_H
+
+#define HELPER_H "accel/tcg/plugin-helpers.h"
+#include "exec/helper-proto.h.inc"
+#undef  HELPER_H
+
+#endif /* HELPER_PROTO_COMMON_H */
diff --git a/include/exec/helper-proto.h b/include/exec/helper-proto.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-proto.h
+++ b/include/exec/helper-proto.h
@@ -XXX,XX +XXX,XX @@
-/* Helper file for declaring TCG helper functions.
-   This one expands prototypes for the helper functions.  */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Helper file for declaring TCG helper functions.
+ * This one expands prototypes for the helper functions.
+ */
 
 #ifndef HELPER_PROTO_H
 #define HELPER_PROTO_H
 
-#include "exec/helper-head.h"
+#include "exec/helper-proto-common.h"
 
-/*
- * Work around an issue with --enable-lto, in which GCC's ipa-split pass
- * decides to split out the noreturn code paths that raise an exception,
- * taking the __builtin_return_address() along into the new function,
- * where it no longer computes a value that returns to TCG generated code.
- * Despite the name, the noinline attribute affects splitter, so this
- * prevents the optimization in question.  Given that helpers should not
- * otherwise be called directly, this should have any other visible effect.
- *
- * See https://gitlab.com/qemu-project/qemu/-/issues/1454
- */
-#define DEF_HELPER_ATTR  __attribute__((noinline))
-
-#define DEF_HELPER_FLAGS_0(name, flags, ret) \
-dh_ctype(ret) HELPER(name) (void) DEF_HELPER_ATTR;
-
-#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1)) DEF_HELPER_ATTR;
-
-#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)) DEF_HELPER_ATTR;
-
-#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), \
-                            dh_ctype(t3)) DEF_HELPER_ATTR;
-
-#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4)) DEF_HELPER_ATTR;
-
-#define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4), dh_ctype(t5)) DEF_HELPER_ATTR;
-
-#define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4), dh_ctype(t5), \
-                            dh_ctype(t6)) DEF_HELPER_ATTR;
-
-#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
-dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
-                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
-                            dh_ctype(t7)) DEF_HELPER_ATTR;
-
-#define IN_HELPER_PROTO
-
-#include "helper.h"
-#include "accel/tcg/tcg-runtime.h"
-#include "accel/tcg/plugin-helpers.h"
-
-#undef IN_HELPER_PROTO
-
-#undef DEF_HELPER_FLAGS_0
-#undef DEF_HELPER_FLAGS_1
-#undef DEF_HELPER_FLAGS_2
-#undef DEF_HELPER_FLAGS_3
-#undef DEF_HELPER_FLAGS_4
-#undef DEF_HELPER_FLAGS_5
-#undef DEF_HELPER_FLAGS_6
-#undef DEF_HELPER_FLAGS_7
-#undef DEF_HELPER_ATTR
+#define HELPER_H "helper.h"
+#include "exec/helper-proto.h.inc"
+#undef  HELPER_H
 
 #endif /* HELPER_PROTO_H */
diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-common.h
+++ b/include/tcg/tcg-op-common.h
@@ -XXX,XX +XXX,XX @@
 #define TCG_TCG_OP_COMMON_H
 
 #include "tcg/tcg.h"
-#include "exec/helper-proto.h"
+#include "exec/helper-proto-common.h"
 #include "exec/helper-gen-common.h"
 
 /* Basic output routines.  Not for general consumption.  */
diff --git a/include/exec/helper-proto.h.inc b/include/exec/helper-proto.h.inc
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/exec/helper-proto.h.inc
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Helper file for declaring TCG helper functions.
+ * This one expands prototypes for the helper functions.
+ * Define HELPER_H for the header file to be expanded.
+ */
+
+#include "exec/helper-head.h"
+
+/*
+ * Work around an issue with --enable-lto, in which GCC's ipa-split pass
+ * decides to split out the noreturn code paths that raise an exception,
+ * taking the __builtin_return_address() along into the new function,
+ * where it no longer computes a value that returns to TCG generated code.
+ * Despite the name, the noinline attribute affects splitter, so this
+ * prevents the optimization in question.  Given that helpers should not
+ * otherwise be called directly, this should not have any other visible effect.
+ *
+ * See https://gitlab.com/qemu-project/qemu/-/issues/1454
+ */
+#define DEF_HELPER_ATTR  __attribute__((noinline))
+
+#define DEF_HELPER_FLAGS_0(name, flags, ret) \
+dh_ctype(ret) HELPER(name) (void) DEF_HELPER_ATTR;
+
+#define DEF_HELPER_FLAGS_1(name, flags, ret, t1) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1)) DEF_HELPER_ATTR;
+
+#define DEF_HELPER_FLAGS_2(name, flags, ret, t1, t2) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2)) DEF_HELPER_ATTR;
+
+#define DEF_HELPER_FLAGS_3(name, flags, ret, t1, t2, t3) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), \
+                            dh_ctype(t3)) DEF_HELPER_ATTR;
+
+#define DEF_HELPER_FLAGS_4(name, flags, ret, t1, t2, t3, t4) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+                            dh_ctype(t4)) DEF_HELPER_ATTR;
+
+#define DEF_HELPER_FLAGS_5(name, flags, ret, t1, t2, t3, t4, t5) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+                            dh_ctype(t4), dh_ctype(t5)) DEF_HELPER_ATTR;
+
+#define DEF_HELPER_FLAGS_6(name, flags, ret, t1, t2, t3, t4, t5, t6) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+                            dh_ctype(t4), dh_ctype(t5), \
+                            dh_ctype(t6)) DEF_HELPER_ATTR;
+
+#define DEF_HELPER_FLAGS_7(name, flags, ret, t1, t2, t3, t4, t5, t6, t7) \
+dh_ctype(ret) HELPER(name) (dh_ctype(t1), dh_ctype(t2), dh_ctype(t3), \
+                            dh_ctype(t4), dh_ctype(t5), dh_ctype(t6), \
+                            dh_ctype(t7)) DEF_HELPER_ATTR;
+
+#define IN_HELPER_PROTO
+
+#include HELPER_H
+
+#undef IN_HELPER_PROTO
+
+#undef DEF_HELPER_FLAGS_0
+#undef DEF_HELPER_FLAGS_1
+#undef DEF_HELPER_FLAGS_2
+#undef DEF_HELPER_FLAGS_3
+#undef DEF_HELPER_FLAGS_4
+#undef DEF_HELPER_FLAGS_5
+#undef DEF_HELPER_FLAGS_6
+#undef DEF_HELPER_FLAGS_7
+#undef DEF_HELPER_ATTR
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg.h"
 #include "qemu/error-report.h"
 #include "exec/log.h"
-#include "exec/helper-proto.h"
+#include "exec/helper-proto-common.h"
 #include "qemu/atomic.h"
 #include "qemu/atomic128.h"
 #include "exec/translate-all.h"
@@ -XXX,XX +XXX,XX @@
 #endif
 #include "tcg/tcg-ldst.h"
 #include "tcg/oversized-guest.h"
-#include "exec/helper-proto.h"
 
 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
 /* #define DEBUG_TLB */
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/exec-all.h"
 #include "exec/plugin-gen.h"
 #include "exec/translator.h"
-#include "exec/helper-proto.h"
+#include "exec/helper-proto-common.h"
 
 #define HELPER_H  "accel/tcg/plugin-helpers.h"
 #include "exec/helper-info.c.inc"
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/host-utils.h"
 #include "cpu.h"
-#include "exec/helper-proto.h"
+#include "exec/helper-proto-common.h"
 #include "tcg/tcg-gvec-desc.h"
 
 
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/osdep.h"
 #include "qemu/host-utils.h"
 #include "cpu.h"
-#include "exec/helper-proto.h"
+#include "exec/helper-proto-common.h"
 #include "exec/cpu_ldst.h"
 #include "exec/exec-all.h"
 #include "disas/disas.h"
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate.c
+++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "translate.h"
 #include "translate-a32.h"
 #include "exec/gen-icount.h"
+#include "exec/helper-proto.h"
 
 #define HELPER_H "helper.h"
 #include "exec/helper-info.c.inc"
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
 #include "exec/helper-gen.h"
+#include "exec/helper-proto.h"
 #include "exec/cpu_ldst.h"
 #include "exec/log.h"
 #include "internal.h"
-- 
2.34.1

Fixes an assertion failure in tcg_gen_code, which checks that we
don't accidentally eliminate an insn_start during optimization.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/sh4/translate.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -XXX,XX +XXX,XX @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)
 
     /* The entire region has been translated.  */
     ctx->envflags &= ~TB_FLAG_GUSA_MASK;
-    ctx->base.pc_next = pc_end;
-    ctx->base.num_insns += max_insns - 1;
-    return;
+    goto done;
 
  fail:
     qemu_log_mask(LOG_UNIMP, "Unrecognized gUSA sequence %08x-%08x\n",
@@ -XXX,XX +XXX,XX @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)
        purposes of accounting within the TB.  We might as well report the
        entire region consumed via ctx->base.pc_next so that it's immediately
        available in the disassembly dump.  */
+
+ done:
     ctx->base.pc_next = pc_end;
     ctx->base.num_insns += max_insns - 1;
+
+    /*
+     * Emit insn_start to cover each of the insns in the region.
+     * This matches an assert in tcg.c making sure that we have
+     * tb->icount * insn_start.
+     */
+    for (i = 1; i < max_insns; ++i) {
+        tcg_gen_insn_start(pc + i * 2, ctx->envflags);
+    }
 }
 #endif
 
-- 
2.34.1

This will enable replacement of TARGET_INSN_START_WORDS in tcg.c.
Split out "tcg/insn-start-words.h" and use it in target/.

Reviewed-by: Anton Johansson <anjo@rev.ng>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/insn-start-words.h | 17 +++++++++++++++++
 include/tcg/tcg-op.h           |  8 ++++----
 include/tcg/tcg-opc.h          |  6 +++---
 include/tcg/tcg.h              |  9 ++-------
 accel/tcg/perf.c               |  8 ++++++--
 accel/tcg/translate-all.c      | 20 +++++++++++++-------
 target/i386/helper.c           |  2 +-
 target/openrisc/sys_helper.c   |  2 +-
 tcg/tcg.c                      | 16 +++++++++++-----
 9 files changed, 58 insertions(+), 30 deletions(-)
 create mode 100644 include/tcg/insn-start-words.h

diff --git a/include/tcg/insn-start-words.h b/include/tcg/insn-start-words.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/tcg/insn-start-words.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Define TARGET_INSN_START_WORDS
+ * Copyright (c) 2008 Fabrice Bellard
+ */
+
+#ifndef TARGET_INSN_START_WORDS
+
+#include "cpu.h"
+
+#ifndef TARGET_INSN_START_EXTRA_WORDS
+# define TARGET_INSN_START_WORDS 1
+#else
+# define TARGET_INSN_START_WORDS (1 + TARGET_INSN_START_EXTRA_WORDS)
+#endif
+
+#endif /* TARGET_INSN_START_WORDS */
diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@
 # error
 #endif
 
-#if TARGET_INSN_START_WORDS == 1
+#ifndef TARGET_INSN_START_EXTRA_WORDS
 static inline void tcg_gen_insn_start(target_ulong pc)
 {
     TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 64 / TCG_TARGET_REG_BITS);
     tcg_set_insn_start_param(op, 0, pc);
 }
-#elif TARGET_INSN_START_WORDS == 2
+#elif TARGET_INSN_START_EXTRA_WORDS == 1
 static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1)
 {
     TCGOp *op = tcg_emit_op(INDEX_op_insn_start, 2 * 64 / TCG_TARGET_REG_BITS);
     tcg_set_insn_start_param(op, 0, pc);
     tcg_set_insn_start_param(op, 1, a1);
 }
-#elif TARGET_INSN_START_WORDS == 3
+#elif TARGET_INSN_START_EXTRA_WORDS == 2
 static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1,
                                       target_ulong a2)
 {
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1,
     tcg_set_insn_start_param(op, 2, a2);
 }
 #else
-# error "Unhandled number of operands to insn_start"
+#error Unhandled TARGET_INSN_START_EXTRA_WORDS value
 #endif
 
 #if TARGET_LONG_BITS == 32
diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -XXX,XX +XXX,XX @@ DEF(mulsh_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_mulsh_i64))
 
 #define DATA64_ARGS  (TCG_TARGET_REG_BITS == 64 ? 1 : 2)
 
-/* QEMU specific */
-DEF(insn_start, 0, 0, DATA64_ARGS * TARGET_INSN_START_WORDS,
-    TCG_OPF_NOT_PRESENT)
+/* There are tcg_ctx->insn_start_words here, not just one. */
+DEF(insn_start, 0, 0, DATA64_ARGS, TCG_OPF_NOT_PRESENT)
+
 DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
 DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
 DEF(goto_ptr, 0, 1, 0, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef uint64_t TCGRegSet;
 #define TCG_TARGET_HAS_v256             0
 #endif
 
-#ifndef TARGET_INSN_START_EXTRA_WORDS
-# define TARGET_INSN_START_WORDS 1
-#else
-# define TARGET_INSN_START_WORDS (1 + TARGET_INSN_START_EXTRA_WORDS)
-#endif
-
 typedef enum TCGOpcode {
 #define DEF(name, oargs, iargs, cargs, flags) INDEX_op_ ## name,
 #include "tcg/tcg-opc.h"
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
     uint8_t page_bits;
     uint8_t tlb_dyn_max_bits;
 #endif
+    uint8_t insn_start_words;
 
     TCGRegSet reserved_regs;
     intptr_t current_frame_offset;
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
     TCGTemp *reg_to_temp[TCG_TARGET_NB_REGS];
 
     uint16_t gen_insn_end_off[TCG_MAX_INSNS];
-    uint64_t gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
+    uint64_t *gen_insn_data;
 
     /* Exit to translator on overflow. */
     sigjmp_buf jmp_trans;
diff --git a/accel/tcg/perf.c b/accel/tcg/perf.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/perf.c
+++ b/accel/tcg/perf.c
@@ -XXX,XX +XXX,XX @@ void perf_report_code(uint64_t guest_pc, TranslationBlock *tb,
                       const void *start)
 {
     struct debuginfo_query *q;
-    size_t insn;
+    size_t insn, start_words;
+    uint64_t *gen_insn_data;
 
     if (!perfmap && !jitdump) {
         return;
@@ -XXX,XX +XXX,XX @@ void perf_report_code(uint64_t guest_pc, TranslationBlock *tb,
     debuginfo_lock();
 
     /* Query debuginfo for each guest instruction. */
+    gen_insn_data = tcg_ctx->gen_insn_data;
+    start_words = tcg_ctx->insn_start_words;
+
     for (insn = 0; insn < tb->icount; insn++) {
         /* FIXME: This replicates the restore_state_to_opc() logic. */
-        q[insn].address = tcg_ctx->gen_insn_data[insn][0];
+        q[insn].address = gen_insn_data[insn * start_words + 0];
         if (tb_cflags(tb) & CF_PCREL) {
             q[insn].address |= (guest_pc & TARGET_PAGE_MASK);
         } else {
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@
 #include "tb-context.h"
 #include "internal.h"
 #include "perf.h"
+#include "tcg/insn-start-words.h"
 
 TBContext tb_ctx;
 
@@ -XXX,XX +XXX,XX @@ static int64_t decode_sleb128(const uint8_t **pp)
 static int encode_search(TranslationBlock *tb, uint8_t *block)
 {
     uint8_t *highwater = tcg_ctx->code_gen_highwater;
+    uint64_t *insn_data = tcg_ctx->gen_insn_data;
+    uint16_t *insn_end_off = tcg_ctx->gen_insn_end_off;
     uint8_t *p = block;
     int i, j, n;
 
     for (i = 0, n = tb->icount; i < n; ++i) {
-        uint64_t prev;
+        uint64_t prev, curr;
 
         for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
             if (i == 0) {
                 prev = (!(tb_cflags(tb) & CF_PCREL) && j == 0 ? tb->pc : 0);
             } else {
-                prev = tcg_ctx->gen_insn_data[i - 1][j];
+                prev = insn_data[(i - 1) * TARGET_INSN_START_WORDS + j];
             }
-            p = encode_sleb128(p, tcg_ctx->gen_insn_data[i][j] - prev);
+            curr = insn_data[i * TARGET_INSN_START_WORDS + j];
+            p = encode_sleb128(p, curr - prev);
         }
-        prev = (i == 0 ? 0 : tcg_ctx->gen_insn_end_off[i - 1]);
-        p = encode_sleb128(p, tcg_ctx->gen_insn_end_off[i] - prev);
+        prev = (i == 0 ? 0 : insn_end_off[i - 1]);
+        curr = insn_end_off[i];
+        p = encode_sleb128(p, curr - prev);
 
         /* Test for (pending) buffer overflow.  The assumption is that any
            one row beginning below the high water mark cannot overrun
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     tcg_ctx->tlb_fast_offset =
         (int)offsetof(ArchCPU, neg.tlb.f) - (int)offsetof(ArchCPU, env);
 #endif
+    tcg_ctx->insn_start_words = TARGET_INSN_START_WORDS;
 
  tb_overflow:
 
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
             fprintf(logfile, "OUT: [size=%d]\n", gen_code_size);
             fprintf(logfile,
                     "  -- guest addr 0x%016" PRIx64 " + tb prologue\n",
-                    tcg_ctx->gen_insn_data[insn][0]);
+                    tcg_ctx->gen_insn_data[insn * TARGET_INSN_START_WORDS]);
             chunk_start = tcg_ctx->gen_insn_end_off[insn];
             disas(logfile, tb->tc.ptr, chunk_start);
 
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
                 size_t chunk_end = tcg_ctx->gen_insn_end_off[insn];
                 if (chunk_end > chunk_start) {
                     fprintf(logfile, "  -- guest addr 0x%016" PRIx64 "\n",
-                            tcg_ctx->gen_insn_data[insn][0]);
+                            tcg_ctx->gen_insn_data[insn * TARGET_INSN_START_WORDS]);
                     disas(logfile, tb->tc.ptr + chunk_start,
                           chunk_end - chunk_start);
                     chunk_start = chunk_end;
diff --git a/target/i386/helper.c b/target/i386/helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -XXX,XX +XXX,XX @@
 #endif
 #include "qemu/log.h"
 #ifdef CONFIG_TCG
-#include "tcg/tcg.h"
+#include "tcg/insn-start-words.h"
 #endif
 
 void cpu_sync_avx_hflag(CPUX86State *env)
diff --git a/target/openrisc/sys_helper.c b/target/openrisc/sys_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/sys_helper.c
+++ b/target/openrisc/sys_helper.c
@@ -XXX,XX +XXX,XX @@
 #ifndef CONFIG_USER_ONLY
 #include "hw/boards.h"
 #endif
-#include "tcg/tcg.h"
+#include "tcg/insn-start-words.h"
 
 #define TO_SPR(group, number) (((group) << 11) + (number))
 
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
     tcg_debug_assert(s->tlb_fast_offset < 0);
     tcg_debug_assert(s->tlb_fast_offset >= MIN_TLB_MASK_TABLE_OFS);
 #endif
+
+    tcg_debug_assert(s->insn_start_words > 0);
 }
 
 static TCGTemp *tcg_temp_alloc(TCGContext *s)
@@ -XXX,XX +XXX,XX @@ static void tcg_dump_ops(TCGContext *s, FILE *f, bool have_prefs)
             nb_oargs = 0;
             col += ne_fprintf(f, "\n ----");
 
-            for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
+            for (i = 0, k = s->insn_start_words; i < k; ++i) {
                 col += ne_fprintf(f, " %016" PRIx64,
                                   tcg_get_insn_start_param(op, i));
             }
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
 #ifdef CONFIG_PROFILER
     TCGProfile *prof = &s->prof;
 #endif
-    int i, num_insns;
+    int i, start_words, num_insns;
     TCGOp *op;
 
 #ifdef CONFIG_PROFILER
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
     s->pool_labels = NULL;
 #endif
 
+    start_words = s->insn_start_words;
+    s->gen_insn_data =
+        tcg_malloc(sizeof(uint64_t) * s->gen_tb->icount * start_words);
+
     num_insns = -1;
     QTAILQ_FOREACH(op, &s->ops, link) {
         TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
                 assert(s->gen_insn_end_off[num_insns] == off);
             }
             num_insns++;
-            for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
-                s->gen_insn_data[num_insns][i] =
+            for (i = 0; i < start_words; ++i) {
+                s->gen_insn_data[num_insns * start_words + i] =
                     tcg_get_insn_start_param(op, i);
             }
             break;
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
             return -2;
         }
     }
-    tcg_debug_assert(num_insns >= 0);
+    tcg_debug_assert(num_insns + 1 == s->gen_tb->icount);
     s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
 
     /* Generate TB finalization at the end of block */
-- 
2.34.1

This replaces the use of TCG_GUEST_DEFAULT_MO in tcg-op-ldst.c.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h         | 1 +
 accel/tcg/translate-all.c | 5 +++++
 tcg/tcg-op-ldst.c         | 4 +---
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
     uint8_t tlb_dyn_max_bits;
 #endif
     uint8_t insn_start_words;
+    TCGBar guest_mo;
 
     TCGRegSet reserved_regs;
     intptr_t current_frame_offset;
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
         (int)offsetof(ArchCPU, neg.tlb.f) - (int)offsetof(ArchCPU, env);
 #endif
     tcg_ctx->insn_start_words = TARGET_INSN_START_WORDS;
+#ifdef TCG_GUEST_DEFAULT_MO
+    tcg_ctx->guest_mo = TCG_GUEST_DEFAULT_MO;
+#else
+    tcg_ctx->guest_mo = TCG_MO_ALL;
+#endif
 
  tb_overflow:
 
diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-ldst.c
+++ b/tcg/tcg-op-ldst.c
@@ -XXX,XX +XXX,XX @@ static void gen_ldst_i64(TCGOpcode opc, TCGv_i64 v, TCGTemp *addr, MemOpIdx oi)
 
 static void tcg_gen_req_mo(TCGBar type)
 {
-#ifdef TCG_GUEST_DEFAULT_MO
-    type &= TCG_GUEST_DEFAULT_MO;
-#endif
+    type &= tcg_ctx->guest_mo;
     type &= ~TCG_TARGET_DEFAULT_MO;
     if (type) {
         tcg_gen_mb(type | TCG_BAR_SC);
-- 
2.34.1

The replacement isn't ideal, as the raw count of bits
is not easily synced with exec/cpu-all.h, but it does
remove from tcg.h the target dependency on TARGET_PAGE_BITS_MIN
which is built into TLB_FLAGS_MASK.

Reviewed-by: Anton Johansson <anjo@rev.ng>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h |  3 +++
 include/tcg/tcg.h      |  4 ----
 tcg/tcg-op-ldst.c      | 18 ++++++++++++++++--
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
  *
  * Use TARGET_PAGE_BITS_MIN so that these bits are constant
  * when TARGET_PAGE_BITS_VARY is in effect.
+ *
+ * The count, if not the placement, of these bits is known
+ * to tcg/tcg-op-ldst.c, check_max_alignment().
  */
 /* Zero if TLB entry is valid.  */
 #define TLB_INVALID_MASK    (1 << (TARGET_PAGE_BITS_MIN - 1))
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ static inline unsigned get_alignment_bits(MemOp memop)
         /* A specific alignment requirement.  */
         a = a >> MO_ASHIFT;
     }
-#if defined(CONFIG_SOFTMMU)
-    /* The requested alignment cannot overlap the TLB flags.  */
-    tcg_debug_assert((TLB_FLAGS_MASK & ((1 << a) - 1)) == 0);
-#endif
     return a;
 }
 
diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-ldst.c
+++ b/tcg/tcg-op-ldst.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg-internal.h"
 
 
-static inline MemOp tcg_canonicalize_memop(MemOp op, bool is64, bool st)
+static void check_max_alignment(unsigned a_bits)
+{
+#if defined(CONFIG_SOFTMMU)
+    /*
+     * The requested alignment cannot overlap the TLB flags.
+     * FIXME: Must keep the count up-to-date with "exec/cpu-all.h".
+     */
+    tcg_debug_assert(a_bits + 6 <= tcg_ctx->page_bits);
+#endif
+}
+
+static MemOp tcg_canonicalize_memop(MemOp op, bool is64, bool st)
 {
-    /* Trigger the asserts within as early as possible.  */
     unsigned a_bits = get_alignment_bits(op);
 
+    check_max_alignment(a_bits);
+
     /* Prefer MO_ALIGN+MO_XX over MO_ALIGN_XX+MO_XX */
     if (a_bits == (op & MO_SIZE)) {
         op = (op & ~MO_AMASK) | MO_ALIGN;
@@ -XXX,XX +XXX,XX @@ static void tcg_gen_qemu_ld_i128_int(TCGv_i128 val, TCGTemp *addr,
     TCGv_i64 ext_addr = NULL;
     TCGOpcode opc;
 
+    check_max_alignment(get_alignment_bits(memop));
     tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
 
     /* TODO: For now, force 32-bit hosts to use the helper. */
@@ -XXX,XX +XXX,XX @@ static void tcg_gen_qemu_st_i128_int(TCGv_i128 val, TCGTemp *addr,
     TCGv_i64 ext_addr = NULL;
     TCGOpcode opc;
 
+    check_max_alignment(get_alignment_bits(memop));
     tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST);
 
     /* TODO: For now, force 32-bit hosts to use the helper. */
-- 
2.34.1

Create tcg/tcg-op-gvec-common.h, moving everything that does not
concern TARGET_LONG_BITS.  Adjust tcg-op-gvec.c to use the new header.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op-gvec-common.h | 426 +++++++++++++++++++++++++++++
 include/tcg/tcg-op-gvec.h        | 444 +------------------------------
 tcg/tcg-op-gvec.c                |   2 +-
 3 files changed, 437 insertions(+), 435 deletions(-)
 create mode 100644 include/tcg/tcg-op-gvec-common.h

diff --git a/include/tcg/tcg-op-gvec-common.h b/include/tcg/tcg-op-gvec-common.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/tcg/tcg-op-gvec-common.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Target independent generic vector operation expansion
+ *
+ * Copyright (c) 2018 Linaro
+ */
+
+#ifndef TCG_TCG_OP_GVEC_COMMON_H
+#define TCG_TCG_OP_GVEC_COMMON_H
+
+/*
+ * "Generic" vectors.  All operands are given as offsets from ENV,
+ * and therefore cannot also be allocated via tcg_global_mem_new_*.
+ * OPRSZ is the byte size of the vector upon which the operation is performed.
+ * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
+ *
+ * All sizes must be 8 or any multiple of 16.
+ * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
+ * Operands may completely, but not partially, overlap.
+ */
+
+/* Expand a call to a gvec-style helper, with pointers to two vector
+   operands, and a descriptor (see tcg-gvec-desc.h).  */
+typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_2 *fn);
+
+/* Similarly, passing an extra data value.  */
+typedef void gen_helper_gvec_2i(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32);
+void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
+                         uint32_t oprsz, uint32_t maxsz, int32_t data,
+                         gen_helper_gvec_2i *fn);
+
+/* Similarly, passing an extra pointer (e.g. env or float_status).  */
+typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
+                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+                        int32_t data, gen_helper_gvec_2_ptr *fn);
+
+/* Similarly, with three vector operands.  */
+typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_3 *fn);
+
+/* Similarly, with four vector operands.  */
+typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                               TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
+                        int32_t data, gen_helper_gvec_4 *fn);
+
+/* Similarly, with five vector operands.  */
+typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                               TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
+                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);
+
+typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                                   TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+                        int32_t data, gen_helper_gvec_3_ptr *fn);
+
+typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
+                        uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_4_ptr *fn);
+
+typedef void gen_helper_gvec_5_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_5_ptr *fn);
+
+/* Expand a gvec operation.  Either inline or out-of-line depending on
+   the actual vector size and the operations supported by the host.  */
+typedef struct {
+    /* Expand inline as a 64-bit or 32-bit integer.
+       Only one of these will be non-NULL.  */
+    void (*fni8)(TCGv_i64, TCGv_i64);
+    void (*fni4)(TCGv_i32, TCGv_i32);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
+    /* Expand out-of-line helper w/descriptor.  */
+    gen_helper_gvec_2 *fno;
+    /* The optional opcodes, if any, utilized by .fniv.  */
+    const TCGOpcode *opt_opc;
+    /* The data argument to the out-of-line helper.  */
+    int32_t data;
+    /* The vector element size, if applicable.  */
+    uint8_t vece;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+    /* Load dest as a 2nd source operand.  */
+    bool load_dest;
+} GVecGen2;
+
+typedef struct {
+    /* Expand inline as a 64-bit or 32-bit integer.
+       Only one of these will be non-NULL.  */
+    void (*fni8)(TCGv_i64, TCGv_i64, int64_t);
+    void (*fni4)(TCGv_i32, TCGv_i32, int32_t);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, int64_t);
+    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
+    gen_helper_gvec_2 *fno;
+    /* Expand out-of-line helper w/descriptor, data as argument.  */
+    gen_helper_gvec_2i *fnoi;
+    /* The optional opcodes, if any, utilized by .fniv.  */
+    const TCGOpcode *opt_opc;
+    /* The vector element size, if applicable.  */
+    uint8_t vece;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+    /* Load dest as a 3rd source operand.  */
+    bool load_dest;
+} GVecGen2i;
+
+typedef struct {
+    /* Expand inline as a 64-bit or 32-bit integer.
+       Only one of these will be non-NULL.  */
+    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
+    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
+    /* Expand out-of-line helper w/descriptor.  */
+    gen_helper_gvec_2i *fno;
+    /* The optional opcodes, if any, utilized by .fniv.  */
+    const TCGOpcode *opt_opc;
+    /* The data argument to the out-of-line helper.  */
+    uint32_t data;
+    /* The vector element size, if applicable.  */
+    uint8_t vece;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+    /* Load scalar as 1st source operand.  */
+    bool scalar_first;
+} GVecGen2s;
+
+typedef struct {
+    /* Expand inline as a 64-bit or 32-bit integer.
+       Only one of these will be non-NULL.  */
+    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
+    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
+    /* Expand out-of-line helper w/descriptor.  */
+    gen_helper_gvec_3 *fno;
+    /* The optional opcodes, if any, utilized by .fniv.  */
+    const TCGOpcode *opt_opc;
+    /* The data argument to the out-of-line helper.  */
+    int32_t data;
+    /* The vector element size, if applicable.  */
+    uint8_t vece;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+    /* Load dest as a 3rd source operand.  */
+    bool load_dest;
+} GVecGen3;
+
+typedef struct {
+    /*
+     * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
+     * non-NULL.
+     */
+    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
+    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
+    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
+    gen_helper_gvec_3 *fno;
+    /* The optional opcodes, if any, utilized by .fniv.  */
+    const TCGOpcode *opt_opc;
+    /* The vector element size, if applicable.  */
+    uint8_t vece;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+    /* Load dest as a 3rd source operand.  */
+    bool load_dest;
+} GVecGen3i;
+
+typedef struct {
+    /* Expand inline as a 64-bit or 32-bit integer.
+       Only one of these will be non-NULL.  */
+    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
+    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
+    /* Expand out-of-line helper w/descriptor.  */
+    gen_helper_gvec_4 *fno;
+    /* The optional opcodes, if any, utilized by .fniv.  */
+    const TCGOpcode *opt_opc;
+    /* The data argument to the out-of-line helper.  */
+    int32_t data;
+    /* The vector element size, if applicable.  */
+    uint8_t vece;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+    /* Write aofs as a 2nd dest operand.  */
+    bool write_aofs;
+} GVecGen4;
+
+typedef struct {
+    /*
+     * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
+     * non-NULL.
+     */
+    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
+    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
+    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
+    gen_helper_gvec_4 *fno;
+    /* The optional opcodes, if any, utilized by .fniv.  */
+    const TCGOpcode *opt_opc;
+    /* The vector element size, if applicable.  */
+    uint8_t vece;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+} GVecGen4i;
+
+void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
+                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
+void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                     uint32_t maxsz, int64_t c, const GVecGen2i *);
+void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *);
+void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
+void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                     uint32_t oprsz, uint32_t maxsz, int64_t c,
+                     const GVecGen3i *);
+void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
+                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);
+void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
+                     uint32_t oprsz, uint32_t maxsz, int64_t c,
+                     const GVecGen4i *);
+
+/* Expand a specific vector operation.  */
+
+void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       int64_t c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       int64_t c, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+
+/* Saturated arithmetic.  */
+void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
+/* Min/max.  */
+void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
+                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       int64_t c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       int64_t c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      int64_t c, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
+                          uint32_t s, uint32_t m);
+void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t s,
+                          uint32_t m, uint64_t imm);
+void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
+                          uint32_t m, TCGv_i32);
+void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
+                          uint32_t m, TCGv_i64);
+
+void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+
+/*
+ * Perform vector shift by vector element, modulo the element size.
+ * E.g.  D[i] = A[i] << (B[i] % (8 << vece)).
+ */
+void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
+                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
+                      uint32_t aofs, uint32_t bofs,
+                      uint32_t oprsz, uint32_t maxsz);
+
+/*
+ * Perform vector bit select: d = (b & a) | (c & ~a).
+ */
+void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
+                         uint32_t bofs, uint32_t cofs,
+                         uint32_t oprsz, uint32_t maxsz);
+
+/*
+ * 64-bit vector operations.  Use these when the register has been allocated
+ * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
+ * OPRSZ = MAXSZ = 8.
+ */
+
+void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
+void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
+void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
+
+void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+
+void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+
+void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c);
+void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c);
+
+/* 32-bit vector operations. */
+void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+
+void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
+
+void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
+
+#endif
diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
 /*
- * Generic vector operation expansion
+ * Target dependent generic vector operation expansion
  *
  * Copyright (c) 2018 Linaro
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef TCG_TCG_OP_GVEC_H
 #define TCG_TCG_OP_GVEC_H
 
-/*
- * "Generic" vectors.  All operands are given as offsets from ENV,
- * and therefore cannot also be allocated via tcg_global_mem_new_*.
- * OPRSZ is the byte size of the vector upon which the operation is performed.
- * MAXSZ is the byte size of the full vector; bytes beyond OPSZ are cleared.
- *
- * All sizes must be 8 or any multiple of 16.
- * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
- * Operands may completely, but not partially, overlap.
- */
+#include "tcg/tcg-op-gvec-common.h"
 
-/* Expand a call to a gvec-style helper, with pointers to two vector
-   operands, and a descriptor (see tcg-gvec-desc.h).  */
-typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
-void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
-                        uint32_t oprsz, uint32_t maxsz, int32_t data,
-                        gen_helper_gvec_2 *fn);
-
-/* Similarly, passing an extra data value.  */
-typedef void gen_helper_gvec_2i(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32);
-void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
-                         uint32_t oprsz, uint32_t maxsz, int32_t data,
-                         gen_helper_gvec_2i *fn);
-
-/* Similarly, passing an extra pointer (e.g. env or float_status).  */
-typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
-void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
-                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
-                        int32_t data, gen_helper_gvec_2_ptr *fn);
-
-/* Similarly, with three vector operands.  */
-typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
-void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-                        uint32_t oprsz, uint32_t maxsz, int32_t data,
-                        gen_helper_gvec_3 *fn);
-
-/* Similarly, with four vector operands.  */
-typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
-                               TCGv_ptr, TCGv_i32);
-void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
-                        int32_t data, gen_helper_gvec_4 *fn);
-
-/* Similarly, with five vector operands.  */
-typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
-                               TCGv_ptr, TCGv_i32);
-void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
-                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);
-
-typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
-                                   TCGv_ptr, TCGv_i32);
-void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
-                        int32_t data, gen_helper_gvec_3_ptr *fn);
-
-typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
-                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
-void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
-                        uint32_t maxsz, int32_t data,
-                        gen_helper_gvec_4_ptr *fn);
-
-typedef void gen_helper_gvec_5_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
-                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
-void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-                        uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
-                        uint32_t oprsz, uint32_t maxsz, int32_t data,
-                        gen_helper_gvec_5_ptr *fn);
-
-/* Expand a gvec operation.  Either inline or out-of-line depending on
-   the actual vector size and the operations supported by the host.  */
-typedef struct {
-    /* Expand inline as a 64-bit or 32-bit integer.
-       Only one of these will be non-NULL.  */
-    void (*fni8)(TCGv_i64, TCGv_i64);
-    void (*fni4)(TCGv_i32, TCGv_i32);
-    /* Expand inline with a host vector type.  */
-    void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
-    /* Expand out-of-line helper w/descriptor.  */
-    gen_helper_gvec_2 *fno;
-    /* The optional opcodes, if any, utilized by .fniv.  */
-    const TCGOpcode *opt_opc;
-    /* The data argument to the out-of-line helper.  */
-    int32_t data;
-    /* The vector element size, if applicable.  */
-    uint8_t vece;
-    /* Prefer i64 to v64.  */
-    bool prefer_i64;
-    /* Load dest as a 2nd source operand.  */
-    bool load_dest;
-} GVecGen2;
-
-typedef struct {
-    /* Expand inline as a 64-bit or 32-bit integer.
-       Only one of these will be non-NULL.  */
-    void (*fni8)(TCGv_i64, TCGv_i64, int64_t);
-    void (*fni4)(TCGv_i32, TCGv_i32, int32_t);
-    /* Expand inline with a host vector type.  */
-    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, int64_t);
-    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
-    gen_helper_gvec_2 *fno;
-    /* Expand out-of-line helper w/descriptor, data as argument.  */
-    gen_helper_gvec_2i *fnoi;
-    /* The optional opcodes, if any, utilized by .fniv.  */
-    const TCGOpcode *opt_opc;
-    /* The vector element size, if applicable.  */
-    uint8_t vece;
-    /* Prefer i64 to v64.  */
-    bool prefer_i64;
-    /* Load dest as a 3rd source operand.  */
-    bool load_dest;
-} GVecGen2i;
-
-typedef struct {
-    /* Expand inline as a 64-bit or 32-bit integer.
-       Only one of these will be non-NULL.  */
-    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
-    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
-    /* Expand inline with a host vector type.  */
-    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
-    /* Expand out-of-line helper w/descriptor.  */
-    gen_helper_gvec_2i *fno;
-    /* The optional opcodes, if any, utilized by .fniv.  */
-    const TCGOpcode *opt_opc;
-    /* The data argument to the out-of-line helper.  */
-    uint32_t data;
-    /* The vector element size, if applicable.  */
-    uint8_t vece;
-    /* Prefer i64 to v64.  */
-    bool prefer_i64;
-    /* Load scalar as 1st source operand.  */
-    bool scalar_first;
-} GVecGen2s;
-
-typedef struct {
-    /* Expand inline as a 64-bit or 32-bit integer.
-       Only one of these will be non-NULL.  */
-    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
-    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
-    /* Expand inline with a host vector type.  */
-    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
-    /* Expand out-of-line helper w/descriptor.  */
-    gen_helper_gvec_3 *fno;
-    /* The optional opcodes, if any, utilized by .fniv.  */
-    const TCGOpcode *opt_opc;
-    /* The data argument to the out-of-line helper.  */
-    int32_t data;
-    /* The vector element size, if applicable.  */
-    uint8_t vece;
-    /* Prefer i64 to v64.  */
-    bool prefer_i64;
-    /* Load dest as a 3rd source operand.  */
-    bool load_dest;
-} GVecGen3;
-
-typedef struct {
-    /*
-     * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
-     * non-NULL.
-     */
-    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
-    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
-    /* Expand inline with a host vector type.  */
-    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
-    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
-    gen_helper_gvec_3 *fno;
-    /* The optional opcodes, if any, utilized by .fniv.  */
-    const TCGOpcode *opt_opc;
-    /* The vector element size, if applicable.  */
-    uint8_t vece;
-    /* Prefer i64 to v64.  */
-    bool prefer_i64;
-    /* Load dest as a 3rd source operand.  */
-    bool load_dest;
-} GVecGen3i;
-
-typedef struct {
-    /* Expand inline as a 64-bit or 32-bit integer.
-       Only one of these will be non-NULL.  */
-    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
-    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
-    /* Expand inline with a host vector type.  */
-    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
-    /* Expand out-of-line helper w/descriptor.  */
-    gen_helper_gvec_4 *fno;
-    /* The optional opcodes, if any, utilized by .fniv.  */
-    const TCGOpcode *opt_opc;
-    /* The data argument to the out-of-line helper.  */
-    int32_t data;
-    /* The vector element size, if applicable.  */
-    uint8_t vece;
-    /* Prefer i64 to v64.  */
-    bool prefer_i64;
-    /* Write aofs as a 2nd dest operand.  */
-    bool write_aofs;
-} GVecGen4;
-
-typedef struct {
-    /*
-     * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
-     * non-NULL.
-     */
-    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
-    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
-    /* Expand inline with a host vector type.  */
-    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
-    /* Expand out-of-line helper w/descriptor, data in descriptor.  */
-    gen_helper_gvec_4 *fno;
-    /* The optional opcodes, if any, utilized by .fniv.  */
-    const TCGOpcode *opt_opc;
-    /* The vector element size, if applicable.  */
-    uint8_t vece;
-    /* Prefer i64 to v64.  */
-    bool prefer_i64;
-} GVecGen4i;
-
-void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
-                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
-void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-                     uint32_t maxsz, int64_t c, const GVecGen2i *);
-void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *);
-void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
-void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
-                     uint32_t oprsz, uint32_t maxsz, int64_t c,
-                     const GVecGen3i *);
-void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
-                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);
-void tcg_gen_gvec_4i(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
-                     uint32_t oprsz, uint32_t maxsz, int64_t c,
-                     const GVecGen4i *);
-
-/* Expand a specific vector operation.  */
-
-void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       int64_t c, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       int64_t c, uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
-
-/* Saturated arithmetic.  */
-void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-
-/* Min/max.  */
-void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
-                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       int64_t c, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       int64_t c, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      int64_t c, uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_andcs(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
-                      TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
-                          uint32_t s, uint32_t m);
-void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t s,
-                          uint32_t m, uint64_t imm);
-void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
-                          uint32_t m, TCGv_i32);
-void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
-                          uint32_t m, TCGv_i64);
-
-#if TARGET_LONG_BITS == 64
-# define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i64
-#else
-# define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i32
+#ifndef TARGET_LONG_BITS
+#error must include QEMU headers
 #endif
 
-void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       int64_t shift, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        int64_t shift, uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_rotrs(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
-
-/*
- * Perform vector shift by vector element, modulo the element size.
- * E.g.  D[i] = A[i] << (B[i] % (8 << vece)).
- */
-void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
-
-void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
-                      uint32_t aofs, uint32_t bofs,
-                      uint32_t oprsz, uint32_t maxsz);
-
-/*
- * Perform vector bit select: d = (b & a) | (c & ~a).
- */
-void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
-                         uint32_t bofs, uint32_t cofs,
-                         uint32_t oprsz, uint32_t maxsz);
-
-/*
- * 64-bit vector operations.  Use these when the register has been allocated
- * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
- * OPRSZ = MAXSZ = 8.
- */
-
-void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
-void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
-void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
-
-void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
-void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
-void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
-
-void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
-void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
-void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
-
-void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
-void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
-void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
-void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
-void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
-void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
-void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c);
-void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c);
-
-/* 32-bit vector operations. */
-void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
-void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
-
-void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
-void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
-
-void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
-void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
-void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
-void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
-void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
-void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
-
 #if TARGET_LONG_BITS == 64
+#define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i64
 #define tcg_gen_vec_add8_tl  tcg_gen_vec_add8_i64
 #define tcg_gen_vec_sub8_tl  tcg_gen_vec_sub8_i64
 #define tcg_gen_vec_add16_tl tcg_gen_vec_add16_i64
@@ -XXX,XX +XXX,XX @@ void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
 #define tcg_gen_vec_shl16i_tl tcg_gen_vec_shl16i_i64
 #define tcg_gen_vec_shr16i_tl tcg_gen_vec_shr16i_i64
 #define tcg_gen_vec_sar16i_tl tcg_gen_vec_sar16i_i64
-
-#else
+#elif TARGET_LONG_BITS == 32
+#define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i32
 #define tcg_gen_vec_add8_tl  tcg_gen_vec_add8_i32
 #define tcg_gen_vec_sub8_tl  tcg_gen_vec_sub8_i32
 #define tcg_gen_vec_add16_tl tcg_gen_vec_add16_i32
@@ -XXX,XX +XXX,XX @@ void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t);
 #define tcg_gen_vec_shl16i_tl tcg_gen_vec_shl16i_i32
 #define tcg_gen_vec_shr16i_tl tcg_gen_vec_shr16i_i32
 #define tcg_gen_vec_sar16i_tl tcg_gen_vec_sar16i_i32
+#else
+# error
 #endif
 
 #endif
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg.h"
 #include "tcg/tcg-temp-internal.h"
 #include "tcg/tcg-op-common.h"
-#include "tcg/tcg-op-gvec.h"
+#include "tcg/tcg-op-gvec-common.h"
 #include "tcg/tcg-gvec-desc.h"
 
 #define MAX_UNROLL  4
-- 
2.34.1

At this remove, it's no longer clear what NO_CPU_IO_DEFS is attempting
to protect.  The last time a use of this define was added to
the source tree, as opposed to merely moved around, was 2008.
There have been many cleanups since that time and this is
no longer required for the build to succeed.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/cpu.h          | 2 --
 target/sparc/cpu.h        | 2 --
 accel/tcg/translate-all.c | 1 -
 tcg/tcg.c                 | 6 ------
 4 files changed, 11 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -XXX,XX +XXX,XX @@ void ppc_store_msr(CPUPPCState *env, target_ulong value);
 void ppc_cpu_list(void);
 
 /* Time-base and decrementer management */
-#ifndef NO_CPU_IO_DEFS
 uint64_t cpu_ppc_load_tbl(CPUPPCState *env);
 uint32_t cpu_ppc_load_tbu(CPUPPCState *env);
 void cpu_ppc_store_tbu(CPUPPCState *env, uint32_t value);
@@ -XXX,XX +XXX,XX @@ int ppcemb_tlb_check(CPUPPCState *env, ppcemb_tlb_t *tlb,
 hwaddr booke206_tlb_to_page_size(CPUPPCState *env,
                                         ppcmas_tlb_t *tlb);
 #endif
-#endif
 
 void ppc_store_fpscr(CPUPPCState *env, target_ulong val);
 void helper_hfscr_facility_check(CPUPPCState *env, uint32_t bit,
diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.h
+++ b/target/sparc/cpu.h
@@ -XXX,XX +XXX,XX @@ G_NORETURN void sparc_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
                                               uintptr_t retaddr);
 G_NORETURN void cpu_raise_exception_ra(CPUSPARCState *, int, uintptr_t);
 
-#ifndef NO_CPU_IO_DEFS
 /* cpu_init.c */
 void cpu_sparc_set_id(CPUSPARCState *env, unsigned int cpu);
 void sparc_cpu_list(void);
@@ -XXX,XX +XXX,XX @@ static inline int tlb_compare_context(const SparcTLBEntry *tlb,
     return compare_masked(context, tlb->tag, MMU_CONTEXT_MASK);
 }
 
-#endif
 #endif
 
 /* cpu-exec.c */
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@
 
 #include "qemu/osdep.h"
 
-#define NO_CPU_IO_DEFS
 #include "trace.h"
 #include "disas/disas.h"
 #include "exec/exec-all.h"
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/cacheflush.h"
 #include "qemu/cacheinfo.h"
 #include "qemu/timer.h"
-
-/* Note: the long term plan is to reduce the dependencies on the QEMU
-   CPU definitions. Currently they are used for qemu_ld/st
-   instructions */
-#define NO_CPU_IO_DEFS
-
 #include "exec/exec-all.h"
 #include "exec/tlb-common.h"
 #include "tcg/tcg-op-common.h"
-- 
2.34.1

This makes TranslationBlock agnostic to the address size of the guest.
Use vaddr for pc, since that's always a virtual address.
Use uint64_t for cs_base, since usage varies between guests.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/exec-all.h | 4 ++--
 accel/tcg/cpu-exec.c    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
      * Unwind information is taken as offsets from the page, to be
      * deposited into the "current" PC.
      */
-    target_ulong pc;
+    vaddr pc;
 
     /*
      * Target-specific data associated with the TranslationBlock, e.g.:
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
      * s390x: instruction data for EXECUTE,
      * sparc: the next pc of the instruction queue (for delay slots).
      */
-    target_ulong cs_base;
+    uint64_t cs_base;
 
     uint32_t flags; /* flags defining in which context the code was generated */
     uint32_t cflags;    /* compile flags */
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ static void log_cpu_exec(target_ulong pc, CPUState *cpu,
 {
     if (qemu_log_in_addr_range(pc)) {
         qemu_log_mask(CPU_LOG_EXEC,
-                      "Trace %d: %p [" TARGET_FMT_lx
+                      "Trace %d: %p [%08" PRIx64
                       "/" TARGET_FMT_lx "/%08x/%08x] %s\n",
                       cpu->cpu_index, tb->tc.ptr, tb->cs_base, pc,
                       tb->flags, tb->cflags, lookup_symbol(pc));
-- 
2.34.1

This is all that is required by tcg/ from exec-all.h.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/exec-all.h          | 132 +--------------------------
 include/exec/translation-block.h | 149 +++++++++++++++++++++++++++++++
 tcg/tcg-op-ldst.c                |   2 +-
 3 files changed, 151 insertions(+), 132 deletions(-)
 create mode 100644 include/exec/translation-block.h

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -XXX,XX +XXX,XX @@
 #ifdef CONFIG_TCG
 #include "exec/cpu_ldst.h"
 #endif
-#include "qemu/interval-tree.h"
+#include "exec/translation-block.h"
 #include "qemu/clang-tsa.h"
 
-/* Page tracking code uses ram addresses in system mode, and virtual
-   addresses in userspace mode.  Define tb_page_addr_t to be an appropriate
-   type.  */
-#if defined(CONFIG_USER_ONLY)
-typedef vaddr tb_page_addr_t;
-#define TB_PAGE_ADDR_FMT "%" VADDR_PRIx
-#else
-typedef ram_addr_t tb_page_addr_t;
-#define TB_PAGE_ADDR_FMT RAM_ADDR_FMT
-#endif
-
 /**
  * cpu_unwind_state_data:
  * @cpu: the cpu context
@@ -XXX,XX +XXX,XX @@ int probe_access_full(CPUArchState *env, target_ulong addr, int size,
                       CPUTLBEntryFull **pfull, uintptr_t retaddr);
 #endif
 
-#define CODE_GEN_ALIGN           16 /* must be >= of the size of a icache line */
-
 /* Estimated block size for TB allocation.  */
 /* ??? The following is based on a 2015 survey of x86_64 host output.
    Better would seem to be some sort of dynamically sized TB array,
@@ -XXX,XX +XXX,XX @@ int probe_access_full(CPUArchState *env, target_ulong addr, int size,
 #define CODE_GEN_AVG_BLOCK_SIZE 150
 #endif
 
-/*
- * Translation Cache-related fields of a TB.
- * This struct exists just for convenience; we keep track of TB's in a binary
- * search tree, and the only fields needed to compare TB's in the tree are
- * @ptr and @size.
- * Note: the address of search data can be obtained by adding @size to @ptr.
- */
-struct tb_tc {
-    const void *ptr;    /* pointer to the translated code */
-    size_t size;
-};
-
-struct TranslationBlock {
-    /*
-     * Guest PC corresponding to this block.  This must be the true
-     * virtual address.  Therefore e.g. x86 stores EIP + CS_BASE, and
-     * targets like Arm, MIPS, HP-PA, which reuse low bits for ISA or
-     * privilege, must store those bits elsewhere.
-     *
-     * If CF_PCREL, the opcodes for the TranslationBlock are written
-     * such that the TB is associated only with the physical page and
-     * may be run in any virtual address context.  In this case, PC
-     * must always be taken from ENV in a target-specific manner.
-     * Unwind information is taken as offsets from the page, to be
-     * deposited into the "current" PC.
-     */
-    vaddr pc;
-
-    /*
-     * Target-specific data associated with the TranslationBlock, e.g.:
-     * x86: the original user, the Code Segment virtual base,
-     * arm: an extension of tb->flags,
-     * s390x: instruction data for EXECUTE,
-     * sparc: the next pc of the instruction queue (for delay slots).
-     */
-    uint64_t cs_base;
-
-    uint32_t flags; /* flags defining in which context the code was generated */
-    uint32_t cflags;    /* compile flags */
-
-/* Note that TCG_MAX_INSNS is 512; we validate this match elsewhere. */
-#define CF_COUNT_MASK    0x000001ff
-#define CF_NO_GOTO_TB    0x00000200 /* Do not chain with goto_tb */
-#define CF_NO_GOTO_PTR   0x00000400 /* Do not chain with goto_ptr */
-#define CF_SINGLE_STEP   0x00000800 /* gdbstub single-step in effect */
-#define CF_LAST_IO       0x00008000 /* Last insn may be an IO access.  */
-#define CF_MEMI_ONLY     0x00010000 /* Only instrument memory ops */
-#define CF_USE_ICOUNT    0x00020000
-#define CF_INVALID       0x00040000 /* TB is stale. Set with @jmp_lock held */
-#define CF_PARALLEL      0x00080000 /* Generate code for a parallel context */
-#define CF_NOIRQ         0x00100000 /* Generate an uninterruptible TB */
-#define CF_PCREL         0x00200000 /* Opcodes in TB are PC-relative */
-#define CF_CLUSTER_MASK  0xff000000 /* Top 8 bits are cluster ID */
-#define CF_CLUSTER_SHIFT 24
-
-    /*
-     * Above fields used for comparing
-     */
-
-    /* size of target code for this block (1 <= size <= TARGET_PAGE_SIZE) */
-    uint16_t size;
-    uint16_t icount;
-
-    struct tb_tc tc;
-
-    /*
-     * Track tb_page_addr_t intervals that intersect this TB.
-     * For user-only, the virtual addresses are always contiguous,
-     * and we use a unified interval tree.  For system, we use a
-     * linked list headed in each PageDesc.  Within the list, the lsb
-     * of the previous pointer tells the index of page_next[], and the
-     * list is protected by the PageDesc lock(s).
-     */
-#ifdef CONFIG_USER_ONLY
-    IntervalTreeNode itree;
-#else
-    uintptr_t page_next[2];
-    tb_page_addr_t page_addr[2];
-#endif
-
-    /* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
-    QemuSpin jmp_lock;
-
-    /* The following data are used to directly call another TB from
-     * the code of this one. This can be done either by emitting direct or
-     * indirect native jump instructions. These jumps are reset so that the TB
-     * just continues its execution. The TB can be linked to another one by
-     * setting one of the jump targets (or patching the jump instruction). Only
-     * two of such jumps are supported.
-     */
-#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
-    uint16_t jmp_reset_offset[2]; /* offset of original jump target */
-    uint16_t jmp_insn_offset[2];  /* offset of direct jump insn */
-    uintptr_t jmp_target_addr[2]; /* target address */
-
-    /*
-     * Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
-     * Each TB can have two outgoing jumps, and therefore can participate
-     * in two lists. The list entries are kept in jmp_list_next[2]. The least
-     * significant bit (LSB) of the pointers in these lists is used to encode
-     * which of the two list entries is to be used in the pointed TB.
-     *
-     * List traversals are protected by jmp_lock. The destination TB of each
-     * outgoing jump is kept in jmp_dest[] so that the appropriate jmp_lock
-     * can be acquired from any origin TB.
-     *
-     * jmp_dest[] are tagged pointers as well. The LSB is set when the TB is
-     * being invalidated, so that no further outgoing jumps from it can be set.
-     *
-     * jmp_lock also protects the CF_INVALID cflag; a jump must not be chained
-     * to a destination TB that has CF_INVALID set.
-     */
-    uintptr_t jmp_list_head;
-    uintptr_t jmp_list_next[2];
-    uintptr_t jmp_dest[2];
-};
-
 /* Hide the qatomic_read to make code a little easier on the eyes */
 static inline uint32_t tb_cflags(const TranslationBlock *tb)
 {
diff --git a/include/exec/translation-block.h b/include/exec/translation-block.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/exec/translation-block.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/*
+ * Definition of TranslationBlock.
+ *  Copyright (c) 2003 Fabrice Bellard
+ */
+
+#ifndef EXEC_TRANSLATION_BLOCK_H
+#define EXEC_TRANSLATION_BLOCK_H
+
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+#include "qemu/interval-tree.h"
+#include "exec/cpu-common.h"
+#include "exec/target_page.h"
+
+/*
+ * Page tracking code uses ram addresses in system mode, and virtual
+ * addresses in userspace mode.  Define tb_page_addr_t to be an
+ * appropriate type.
+ */
+#if defined(CONFIG_USER_ONLY)
+typedef vaddr tb_page_addr_t;
+#define TB_PAGE_ADDR_FMT "%" VADDR_PRIx
+#else
+typedef ram_addr_t tb_page_addr_t;
+#define TB_PAGE_ADDR_FMT RAM_ADDR_FMT
+#endif
+
+/*
+ * Translation Cache-related fields of a TB.
+ * This struct exists just for convenience; we keep track of TB's in a binary
+ * search tree, and the only fields needed to compare TB's in the tree are
+ * @ptr and @size.
+ * Note: the address of search data can be obtained by adding @size to @ptr.
+ */
+struct tb_tc {
+    const void *ptr;    /* pointer to the translated code */
+    size_t size;
+};
+
+struct TranslationBlock {
+    /*
+     * Guest PC corresponding to this block.  This must be the true
+     * virtual address.  Therefore e.g. x86 stores EIP + CS_BASE, and
+     * targets like Arm, MIPS, HP-PA, which reuse low bits for ISA or
+     * privilege, must store those bits elsewhere.
+     *
+     * If CF_PCREL, the opcodes for the TranslationBlock are written
+     * such that the TB is associated only with the physical page and
+     * may be run in any virtual address context.  In this case, PC
+     * must always be taken from ENV in a target-specific manner.
+     * Unwind information is taken as offsets from the page, to be
+     * deposited into the "current" PC.
+     */
+    vaddr pc;
+
+    /*
+     * Target-specific data associated with the TranslationBlock, e.g.:
+     * x86: the original user, the Code Segment virtual base,
+     * arm: an extension of tb->flags,
+     * s390x: instruction data for EXECUTE,
+     * sparc: the next pc of the instruction queue (for delay slots).
+     */
+    uint64_t cs_base;
+
+    uint32_t flags; /* flags defining in which context the code was generated */
+    uint32_t cflags;    /* compile flags */
+
+/* Note that TCG_MAX_INSNS is 512; we validate this match elsewhere. */
+#define CF_COUNT_MASK    0x000001ff
+#define CF_NO_GOTO_TB    0x00000200 /* Do not chain with goto_tb */
+#define CF_NO_GOTO_PTR   0x00000400 /* Do not chain with goto_ptr */
+#define CF_SINGLE_STEP   0x00000800 /* gdbstub single-step in effect */
+#define CF_LAST_IO       0x00008000 /* Last insn may be an IO access.  */
+#define CF_MEMI_ONLY     0x00010000 /* Only instrument memory ops */
+#define CF_USE_ICOUNT    0x00020000
+#define CF_INVALID       0x00040000 /* TB is stale. Set with @jmp_lock held */
+#define CF_PARALLEL      0x00080000 /* Generate code for a parallel context */
+#define CF_NOIRQ         0x00100000 /* Generate an uninterruptible TB */
+#define CF_PCREL         0x00200000 /* Opcodes in TB are PC-relative */
+#define CF_CLUSTER_MASK  0xff000000 /* Top 8 bits are cluster ID */
+#define CF_CLUSTER_SHIFT 24
+
+    /*
+     * Above fields used for comparing
+     */
+
+    /* size of target code for this block (1 <= size <= TARGET_PAGE_SIZE) */
+    uint16_t size;
+    uint16_t icount;
+
+    struct tb_tc tc;
+
+    /*
+     * Track tb_page_addr_t intervals that intersect this TB.
+     * For user-only, the virtual addresses are always contiguous,
+     * and we use a unified interval tree.  For system, we use a
+     * linked list headed in each PageDesc.  Within the list, the lsb
+     * of the previous pointer tells the index of page_next[], and the
+     * list is protected by the PageDesc lock(s).
+     */
+#ifdef CONFIG_USER_ONLY
+    IntervalTreeNode itree;
+#else
+    uintptr_t page_next[2];
+    tb_page_addr_t page_addr[2];
+#endif
+
+    /* jmp_lock placed here to fill a 4-byte hole. Its documentation is below */
+    QemuSpin jmp_lock;
+
+    /* The following data are used to directly call another TB from
+     * the code of this one. This can be done either by emitting direct or
+     * indirect native jump instructions. These jumps are reset so that the TB
+     * just continues its execution. The TB can be linked to another one by
+     * setting one of the jump targets (or patching the jump instruction). Only
+     * two of such jumps are supported.
+     */
+#define TB_JMP_OFFSET_INVALID 0xffff /* indicates no jump generated */
+    uint16_t jmp_reset_offset[2]; /* offset of original jump target */
+    uint16_t jmp_insn_offset[2];  /* offset of direct jump insn */
+    uintptr_t jmp_target_addr[2]; /* target address */
+
+    /*
+     * Each TB has a NULL-terminated list (jmp_list_head) of incoming jumps.
+     * Each TB can have two outgoing jumps, and therefore can participate
+     * in two lists. The list entries are kept in jmp_list_next[2]. The least
+     * significant bit (LSB) of the pointers in these lists is used to encode
+     * which of the two list entries is to be used in the pointed TB.
+     *
+     * List traversals are protected by jmp_lock. The destination TB of each
+     * outgoing jump is kept in jmp_dest[] so that the appropriate jmp_lock
+     * can be acquired from any origin TB.
+     *
+     * jmp_dest[] are tagged pointers as well. The LSB is set when the TB is
+     * being invalidated, so that no further outgoing jumps from it can be set.
+     *
+     * jmp_lock also protects the CF_INVALID cflag; a jump must not be chained
+     * to a destination TB that has CF_INVALID set.
+     */
+    uintptr_t jmp_list_head;
+    uintptr_t jmp_list_next[2];
+    uintptr_t jmp_dest[2];
+};
+
+/* The alignment given to TranslationBlock during allocation. */
+#define CODE_GEN_ALIGN  16
+
+#endif /* EXEC_TRANSLATION_BLOCK_H */
diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-ldst.c
+++ b/tcg/tcg-op-ldst.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "tcg/tcg-temp-internal.h"
 #include "tcg/tcg-op-common.h"
 #include "tcg/tcg-mo.h"
+#include "exec/translation-block.h"
 #include "exec/plugin-gen.h"
 #include "tcg-internal.h"
 
-- 
2.34.1

The last use was removed with 2ac01d6dafab.

Fixes: 2ac01d6dafab ("translate-all: use a binary search tree to track TBs in TBContext")
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/exec-all.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -XXX,XX +XXX,XX @@ int probe_access_full(CPUArchState *env, target_ulong addr, int size,
                       CPUTLBEntryFull **pfull, uintptr_t retaddr);
 #endif
 
-/* Estimated block size for TB allocation.  */
-/* ??? The following is based on a 2015 survey of x86_64 host output.
-   Better would seem to be some sort of dynamically sized TB array,
-   adapting to the block sizes actually being produced.  */
-#if defined(CONFIG_SOFTMMU)
-#define CODE_GEN_AVG_BLOCK_SIZE 400
-#else
-#define CODE_GEN_AVG_BLOCK_SIZE 150
-#endif
-
 /* Hide the qatomic_read to make code a little easier on the eyes */
 static inline uint32_t tb_cflags(const TranslationBlock *tb)
 {
-- 
2.34.1

The only uses of gen_tb_start and gen_tb_end are here.
Move the static icount_start_insn variable into a local
within translator_loop.  Simplify the two subroutines
by passing in the existing local cflags variable.

Leave only the declaration of gen_io_start in gen-icount.h.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/gen-icount.h | 79 +------------------------------------
 accel/tcg/translator.c    | 83 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 80 deletions(-)

diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -XXX,XX +XXX,XX @@
 #ifndef GEN_ICOUNT_H
 #define GEN_ICOUNT_H
 
-#include "exec/exec-all.h"
-
-/* Helpers for instruction counting code generation.  */
-
-static TCGOp *icount_start_insn;
-
-static inline void gen_io_start(void)
-{
-    tcg_gen_st_i32(tcg_constant_i32(1), cpu_env,
-                   offsetof(ArchCPU, parent_obj.can_do_io) -
-                   offsetof(ArchCPU, env));
-}
-
-static inline void gen_tb_start(const TranslationBlock *tb)
-{
-    TCGv_i32 count = tcg_temp_new_i32();
-
-    tcg_gen_ld_i32(count, cpu_env,
-                   offsetof(ArchCPU, neg.icount_decr.u32) -
-                   offsetof(ArchCPU, env));
-
-    if (tb_cflags(tb) & CF_USE_ICOUNT) {
-        /*
-         * We emit a sub with a dummy immediate argument. Keep the insn index
-         * of the sub so that we later (when we know the actual insn count)
-         * can update the argument with the actual insn count.
-         */
-        tcg_gen_sub_i32(count, count, tcg_constant_i32(0));
-        icount_start_insn = tcg_last_op();
-    }
-
-    /*
-     * Emit the check against icount_decr.u32 to see if we should exit
-     * unless we suppress the check with CF_NOIRQ. If we are using
-     * icount and have suppressed interruption the higher level code
-     * should have ensured we don't run more instructions than the
-     * budget.
-     */
-    if (tb_cflags(tb) & CF_NOIRQ) {
-        tcg_ctx->exitreq_label = NULL;
-    } else {
-        tcg_ctx->exitreq_label = gen_new_label();
-        tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, tcg_ctx->exitreq_label);
-    }
-
-    if (tb_cflags(tb) & CF_USE_ICOUNT) {
-        tcg_gen_st16_i32(count, cpu_env,
-                         offsetof(ArchCPU, neg.icount_decr.u16.low) -
-                         offsetof(ArchCPU, env));
-        /*
-         * cpu->can_do_io is cleared automatically here at the beginning of
-         * each translation block.  The cost is minimal and only paid for
-         * -icount, plus it would be very easy to forget doing it in the
-         * translator. Doing it here means we don't need a gen_io_end() to
-         * go with gen_io_start().
-         */
-        tcg_gen_st_i32(tcg_constant_i32(0), cpu_env,
-                       offsetof(ArchCPU, parent_obj.can_do_io) -
-                       offsetof(ArchCPU, env));
-    }
-}
-
-static inline void gen_tb_end(const TranslationBlock *tb, int num_insns)
-{
-    if (tb_cflags(tb) & CF_USE_ICOUNT) {
-        /*
-         * Update the num_insn immediate parameter now that we know
-         * the actual insn count.
-         */
-        tcg_set_insn_param(icount_start_insn, 2,
-                           tcgv_i32_arg(tcg_constant_i32(num_insns)));
-    }
-
-    if (tcg_ctx->exitreq_label) {
-        gen_set_label(tcg_ctx->exitreq_label);
-        tcg_gen_exit_tb(tb, TB_EXIT_REQUESTED);
-    }
-}
+void gen_io_start(void);
 
 #endif
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/plugin-gen.h"
 #include "exec/replay-core.h"
 
+
+void gen_io_start(void)
+{
+    tcg_gen_st_i32(tcg_constant_i32(1), cpu_env,
+                   offsetof(ArchCPU, parent_obj.can_do_io) -
+                   offsetof(ArchCPU, env));
+}
+
+static TCGOp *gen_tb_start(uint32_t cflags)
+{
+    TCGv_i32 count = tcg_temp_new_i32();
+    TCGOp *icount_start_insn = NULL;
+
+    tcg_gen_ld_i32(count, cpu_env,
+                   offsetof(ArchCPU, neg.icount_decr.u32) -
+                   offsetof(ArchCPU, env));
+
+    if (cflags & CF_USE_ICOUNT) {
+        /*
+         * We emit a sub with a dummy immediate argument. Keep the insn index
+         * of the sub so that we later (when we know the actual insn count)
+         * can update the argument with the actual insn count.
+         */
+        tcg_gen_sub_i32(count, count, tcg_constant_i32(0));
+        icount_start_insn = tcg_last_op();
+    }
+
+    /*
+     * Emit the check against icount_decr.u32 to see if we should exit
+     * unless we suppress the check with CF_NOIRQ. If we are using
+     * icount and have suppressed interruption the higher level code
+     * should have ensured we don't run more instructions than the
+     * budget.
+     */
+    if (cflags & CF_NOIRQ) {
+        tcg_ctx->exitreq_label = NULL;
+    } else {
+        tcg_ctx->exitreq_label = gen_new_label();
+        tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, tcg_ctx->exitreq_label);
+    }
+
+    if (cflags & CF_USE_ICOUNT) {
+        tcg_gen_st16_i32(count, cpu_env,
+                         offsetof(ArchCPU, neg.icount_decr.u16.low) -
+                         offsetof(ArchCPU, env));
+        /*
+         * cpu->can_do_io is cleared automatically here at the beginning of
+         * each translation block.  The cost is minimal and only paid for
+         * -icount, plus it would be very easy to forget doing it in the
+         * translator. Doing it here means we don't need a gen_io_end() to
+         * go with gen_io_start().
+         */
+        tcg_gen_st_i32(tcg_constant_i32(0), cpu_env,
+                       offsetof(ArchCPU, parent_obj.can_do_io) -
+                       offsetof(ArchCPU, env));
+    }
+
+    return icount_start_insn;
+}
+
+static void gen_tb_end(const TranslationBlock *tb, uint32_t cflags,
+                       TCGOp *icount_start_insn, int num_insns)
+{
+    if (cflags & CF_USE_ICOUNT) {
+        /*
+         * Update the num_insn immediate parameter now that we know
+         * the actual insn count.
+         */
+        tcg_set_insn_param(icount_start_insn, 2,
+                           tcgv_i32_arg(tcg_constant_i32(num_insns)));
+    }
+
+    if (tcg_ctx->exitreq_label) {
+        gen_set_label(tcg_ctx->exitreq_label);
+        tcg_gen_exit_tb(tb, TB_EXIT_REQUESTED);
+    }
+}
+
 bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest)
 {
     /* Suppress goto_tb if requested. */
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
                      const TranslatorOps *ops, DisasContextBase *db)
 {
     uint32_t cflags = tb_cflags(tb);
+    TCGOp *icount_start_insn;
     bool plugin_enabled;
 
     /* Initialize DisasContext */
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
     tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
 
     /* Start translating.  */
-    gen_tb_start(db->tb);
+    icount_start_insn = gen_tb_start(cflags);
     ops->tb_start(db, cpu);
     tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
 
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
 
     /* Emit code to exit the TB, as indicated by db->is_jmp.  */
     ops->tb_stop(db, cpu);
-    gen_tb_end(db->tb, db->num_insns);
+    gen_tb_end(tb, cflags, icount_start_insn, db->num_insns);
 
     if (plugin_enabled) {
         plugin_gen_tb_end(cpu);
-- 
2.34.1

New wrapper around gen_io_start which takes care of the USE_ICOUNT
check, as well as marking the DisasContext to end the TB.
Remove exec/gen-icount.h.

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: ui/cocoa.m
 Main loop
 M: Paolo Bonzini <pbonzini@redhat.com>
 S: Maintained
-F: include/exec/gen-icount.h
 F: include/qemu/main-loop.h
 F: include/sysemu/runstate.h
 F: include/sysemu/runstate-action.h
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
deleted file mode 100644
index XXXXXXX..XXXXXXX
--- a/include/exec/gen-icount.h
+++ /dev/null
@@ -XXX,XX +XXX,XX @@
-#ifndef GEN_ICOUNT_H
-#define GEN_ICOUNT_H
-
-void gen_io_start(void);
-
-#endif
diff --git a/include/exec/translator.h b/include/exec/translator.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
  */
 bool translator_use_goto_tb(DisasContextBase *db, target_ulong dest);
 
+/**
+ * translator_io_start
+ * @db: Disassembly context
+ *
+ * If icount is enabled, set cpu->can_do_io, adjust db->is_jmp to
+ * DISAS_TOO_MANY if it is still DISAS_NEXT, and return true.
+ * Otherwise return false.
+ */
+bool translator_io_start(DisasContextBase *db);
+
 /*
  * Translator Load Functions
  *
diff --git a/target/arm/cpregs.h b/target/arm/cpregs.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpregs.h
+++ b/target/arm/cpregs.h
@@ -XXX,XX +XXX,XX @@ enum {
     ARM_CP_ALIAS                 = 1 << 8,
     /*
      * Flag: Register does I/O and therefore its accesses need to be marked
-     * with gen_io_start() and also end the TB. In particular, registers which
-     * implement clocks or timers require this.
+     * with translator_io_start() and also end the TB. In particular,
+     * registers which implement clocks or timers require this.
      */
     ARM_CP_IO                    = 1 << 9,
     /*
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg.h"
 #include "tcg/tcg-op.h"
 #include "exec/exec-all.h"
-#include "exec/gen-icount.h"
 #include "exec/log.h"
 #include "exec/translator.h"
 #include "exec/plugin-gen.h"
 #include "exec/replay-core.h"
 
 
-void gen_io_start(void)
+static void gen_io_start(void)
 {
     tcg_gen_st_i32(tcg_constant_i32(1), cpu_env,
                    offsetof(ArchCPU, parent_obj.can_do_io) -
                    offsetof(ArchCPU, env));
 }
 
+bool translator_io_start(DisasContextBase *db)
+{
+    uint32_t cflags = tb_cflags(db->tb);
+
+    if (!(cflags & CF_USE_ICOUNT)) {
+        return false;
+    }
+    if (db->num_insns == db->max_insns && (cflags & CF_LAST_IO)) {
+        /* Already started in translator_loop. */
+        return true;
+    }
+
+    gen_io_start();
+
+    /*
+     * Ensure that this instruction will be the last in the TB.
+     * The target may override this to something more forceful.
+     */
+    if (db->is_jmp == DISAS_NEXT) {
+        db->is_jmp = DISAS_TOO_MANY;
+    }
+    return true;
+}
+
 static TCGOp *gen_tb_start(uint32_t cflags)
 {
     TCGv_i32 count = tcg_temp_new_i32();
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_lock_value;
 static TCGv cpu_pal_ir[31];
 #endif
 
-#include "exec/gen-icount.h"
-
 void alpha_translate_init(void)
 {
 #define DEF_VAR(V)  { &cpu_##V, #V, offsetof(CPUAlphaState, V) }
@@ -XXX,XX +XXX,XX @@ static DisasJumpType gen_mfpr(DisasContext *ctx, TCGv va, int regno)
     case 249: /* VMTIME */
         helper = gen_helper_get_vmtime;
     do_helper:
-        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
+        if (translator_io_start(&ctx->base)) {
             helper(va);
             return DISAS_PC_STALE;
         } else {
@@ -XXX,XX +XXX,XX @@ static DisasJumpType gen_mtpr(DisasContext *ctx, TCGv vb, int regno)
 
     case 251:
         /* ALARM */
-        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
+        if (translator_io_start(&ctx->base)) {
             ret = DISAS_PC_STALE;
         }
         gen_helper_set_alarm(cpu_env, vb);
@@ -XXX,XX +XXX,XX @@ static DisasJumpType translate_one(DisasContext *ctx, uint32_t insn)
         case 0xC000:
             /* RPCC */
             va = dest_gpr(ctx, ra);
-            if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-                gen_io_start();
-                gen_helper_load_pcc(va, cpu_env);
+            if (translator_io_start(&ctx->base)) {
                 ret = DISAS_PC_STALE;
-            } else {
-                gen_helper_load_pcc(va, cpu_env);
             }
+            gen_helper_load_pcc(va, cpu_env);
             break;
         case 0xE000:
             /* RC */
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -XXX,XX +XXX,XX @@
 #include "internals.h"
 #include "qemu/host-utils.h"
 #include "semihosting/semihost.h"
-#include "exec/gen-icount.h"
 #include "exec/log.h"
 #include "cpregs.h"
 #include "translate-a64.h"
@@ -XXX,XX +XXX,XX @@ static bool trans_ERET(DisasContext *s, arg_ERET *a)
     tcg_gen_ld_i64(dst, cpu_env,
                    offsetof(CPUARMState, elr_el[s->current_el]));
 
-    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&s->base);
 
     gen_helper_exception_return(cpu_env, dst);
     /* Must exit loop to check un-masked IRQs */
@@ -XXX,XX +XXX,XX @@ static bool trans_ERETA(DisasContext *s, arg_reta *a)
                    offsetof(CPUARMState, elr_el[s->current_el]));
 
     dst = auth_branch_target(s, dst, cpu_X[31], !a->m);
-    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+
+    translator_io_start(&s->base);
 
     gen_helper_exception_return(cpu_env, dst);
     /* Must exit loop to check un-masked IRQs */
@@ -XXX,XX +XXX,XX @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
     uint32_t key = ENCODE_AA64_CP_REG(CP_REG_ARM64_SYSREG_CP,
                                       crn, crm, op0, op1, op2);
     const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key);
+    bool need_exit_tb = false;
     TCGv_ptr tcg_ri = NULL;
     TCGv_i64 tcg_rt;
 
@@ -XXX,XX +XXX,XX @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
         return;
     }
 
-    if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
-        gen_io_start();
+    if (ri->type & ARM_CP_IO) {
+        /* I/O operations must end the TB here (whether read or write) */
+        need_exit_tb = translator_io_start(&s->base);
     }
 
     tcg_rt = cpu_reg(s, rt);
@@ -XXX,XX +XXX,XX @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
         }
     }
 
-    if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
-        /* I/O operations must end the TB here (whether read or write) */
-        s->base.is_jmp = DISAS_UPDATE_EXIT;
-    }
     if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
         /*
          * A write to any coprocessor regiser that ends a TB
@@ -XXX,XX +XXX,XX @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
          * but allow this to be suppressed by the register definition
          * (usually only necessary to work around guest bugs).
          */
+        need_exit_tb = true;
+    }
+    if (need_exit_tb) {
         s->base.is_jmp = DISAS_UPDATE_EXIT;
     }
 }
diff --git a/target/arm/tcg/translate-mve.c b/target/arm/tcg/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-mve.c
+++ b/target/arm/tcg/translate-mve.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
 #include "exec/exec-all.h"
-#include "exec/gen-icount.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-neon.c
+++ b/target/arm/tcg/translate-neon.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
 #include "exec/exec-all.h"
-#include "exec/gen-icount.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-vfp.c
+++ b/target/arm/tcg/translate-vfp.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
 #include "exec/exec-all.h"
-#include "exec/gen-icount.h"
 #include "translate.h"
 #include "translate-a32.h"
 
@@ -XXX,XX +XXX,XX @@ static void gen_preserve_fp_state(DisasContext *s, bool skip_context_update)
          * so we must mark it as an IO operation for icount (and cause
          * this to be the last insn in the TB).
          */
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
+        if (translator_io_start(&s->base)) {
             s->base.is_jmp = DISAS_UPDATE_EXIT;
-            gen_io_start();
         }
         gen_helper_v7m_preserve_fp_state(cpu_env);
         /*
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate.c
+++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpregs.h"
 #include "translate.h"
 #include "translate-a32.h"
-#include "exec/gen-icount.h"
 #include "exec/helper-proto.h"
 
 #define HELPER_H "helper.h"
@@ -XXX,XX +XXX,XX @@ static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr)
      * appropriately depending on the new Thumb bit, so it must
      * be called after storing the new PC.
      */
-    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&s->base);
     gen_helper_cpsr_write_eret(cpu_env, cpsr);
     /* Must exit loop to check un-masked IRQs */
     s->base.is_jmp = DISAS_EXIT;
@@ -XXX,XX +XXX,XX @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64,
     uint32_t key = ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2);
     const ARMCPRegInfo *ri = get_arm_cp_reginfo(s->cp_regs, key);
     TCGv_ptr tcg_ri = NULL;
-    bool need_exit_tb;
+    bool need_exit_tb = false;
     uint32_t syndrome;
 
     /*
@@ -XXX,XX +XXX,XX @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64,
         g_assert_not_reached();
     }
 
-    if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
-        gen_io_start();
+    if (ri->type & ARM_CP_IO) {
+        /* I/O operations must end the TB here (whether read or write) */
+        need_exit_tb = translator_io_start(&s->base);
     }
 
     if (isread) {
@@ -XXX,XX +XXX,XX @@ static void do_coproc_insn(DisasContext *s, int cpnum, int is64,
         }
     }
 
-    /* I/O operations must end the TB here (whether read or write) */
-    need_exit_tb = ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) &&
-                    (ri->type & ARM_CP_IO));
-
     if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
         /*
          * A write to any coprocessor register that ends a TB
@@ -XXX,XX +XXX,XX @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n)
     if (exc_return) {
         /* Restore CPSR from SPSR.  */
         tmp = load_cpu_field(spsr);
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-        }
+        translator_io_start(&s->base);
         gen_helper_cpsr_write_eret(cpu_env, tmp);
         /* Must exit loop to check un-masked IRQs */
         s->base.is_jmp = DISAS_EXIT;
diff --git a/target/avr/translate.c b/target/avr/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/translate.c
+++ b/target/avr/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/helper-gen.h"
 #include "exec/log.h"
 #include "exec/translator.h"
-#include "exec/gen-icount.h"
 
 #define HELPER_H "helper.h"
 #include "exec/helper-info.c.inc"
diff --git a/target/cris/translate.c b/target/cris/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/cris/translate.c
+++ b/target/cris/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv env_btaken;
 static TCGv env_btarget;
 static TCGv env_pc;
 
-#include "exec/gen-icount.h"
-
 /* This is the state at translation time.  */
 typedef struct DisasContext {
     DisasContextBase base;
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_reg cpu_psw_v;
 static TCGv_reg cpu_psw_cb;
 static TCGv_reg cpu_psw_cb_msb;
 
-#include "exec/gen-icount.h"
-
 void hppa_translate_init(void)
 {
 #define DEF_VAR(V)  { &cpu_##V, #V, offsetof(CPUHPPAState, V) }
@@ -XXX,XX +XXX,XX @@ static bool trans_mfctl(DisasContext *ctx, arg_mfctl *a)
         /* FIXME: Respect PSW_S bit.  */
         nullify_over(ctx);
         tmp = dest_gpr(ctx, rt);
-        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
+        if (translator_io_start(&ctx->base)) {
             gen_helper_read_interval_timer(tmp);
             ctx->base.is_jmp = DISAS_IAQ_N_STALE;
         } else {
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_seg_base[6];
 static TCGv_i64 cpu_bndl[4];
 static TCGv_i64 cpu_bndu[4];
 
-#include "exec/gen-icount.h"
-
 typedef struct DisasContext {
     DisasContextBase base;
 
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
                 !(s->cpuid_ext_features & CPUID_EXT_RDRAND)) {
                 goto illegal_op;
             }
-            if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-                gen_io_start();
-                s->base.is_jmp = DISAS_TOO_MANY;
-            }
+            translator_io_start(&s->base);
             gen_helper_rdrand(s->T0, cpu_env);
             rm = (modrm & 7) | REX_B(s);
             gen_op_mov_reg_v(s, dflag, rm, s->T0);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
                           SVM_IOIO_TYPE_MASK | SVM_IOIO_STR_MASK)) {
             break;
         }
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-            s->base.is_jmp = DISAS_TOO_MANY;
-        }
+        translator_io_start(&s->base);
         if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) {
             gen_repz_ins(s, ot);
         } else {
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
         if (!gen_check_io(s, ot, s->tmp2_i32, SVM_IOIO_STR_MASK)) {
             break;
         }
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-            s->base.is_jmp = DISAS_TOO_MANY;
-        }
+        translator_io_start(&s->base);
         if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) {
             gen_repz_outs(s, ot);
         } else {
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
         if (!gen_check_io(s, ot, s->tmp2_i32, SVM_IOIO_TYPE_MASK)) {
             break;
         }
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-            s->base.is_jmp = DISAS_TOO_MANY;
-        }
+        translator_io_start(&s->base);
         gen_helper_in_func(ot, s->T1, s->tmp2_i32);
         gen_op_mov_reg_v(s, ot, R_EAX, s->T1);
         gen_bpt_io(s, s->tmp2_i32, ot);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
         if (!gen_check_io(s, ot, s->tmp2_i32, 0)) {
             break;
         }
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-            s->base.is_jmp = DISAS_TOO_MANY;
-        }
+        translator_io_start(&s->base);
         gen_op_mov_v_reg(s, ot, s->T1, R_EAX);
         tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
         gen_helper_out_func(ot, s->tmp2_i32, s->tmp3_i32);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
         if (!gen_check_io(s, ot, s->tmp2_i32, SVM_IOIO_TYPE_MASK)) {
             break;
         }
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-            s->base.is_jmp = DISAS_TOO_MANY;
-        }
+        translator_io_start(&s->base);
         gen_helper_in_func(ot, s->T1, s->tmp2_i32);
         gen_op_mov_reg_v(s, ot, R_EAX, s->T1);
         gen_bpt_io(s, s->tmp2_i32, ot);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
         if (!gen_check_io(s, ot, s->tmp2_i32, 0)) {
             break;
         }
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-            s->base.is_jmp = DISAS_TOO_MANY;
-        }
+        translator_io_start(&s->base);
         gen_op_mov_v_reg(s, ot, s->T1, R_EAX);
         tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
         gen_helper_out_func(ot, s->tmp2_i32, s->tmp3_i32);
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
     case 0x131: /* rdtsc */
         gen_update_cc_op(s);
         gen_update_eip_cur(s);
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-            s->base.is_jmp = DISAS_TOO_MANY;
-        }
+        translator_io_start(&s->base);
         gen_helper_rdtsc(cpu_env);
         break;
     case 0x133: /* rdpmc */
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
             }
             gen_update_cc_op(s);
             gen_update_eip_cur(s);
-            if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-                gen_io_start();
-                s->base.is_jmp = DISAS_TOO_MANY;
-            }
+            translator_io_start(&s->base);
             gen_helper_rdtscp(cpu_env);
             break;
 
@@ -XXX,XX +XXX,XX @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
         }
         ot  = (CODE64(s) ? MO_64 : MO_32);
 
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-            s->base.is_jmp = DISAS_TOO_MANY;
-        }
+        translator_io_start(&s->base);
         if (b & 2) {
             gen_svm_check_intercept(s, SVM_EXIT_WRITE_CR0 + reg);
             gen_op_mov_v_reg(s, ot, s->T0, rm);
diff --git a/target/loongarch/translate.c b/target/loongarch/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/translate.c
+++ b/target/loongarch/translate.c
@@ -XXX,XX +XXX,XX @@
 TCGv cpu_gpr[32], cpu_pc;
 static TCGv cpu_lladdr, cpu_llval;
 
-#include "exec/gen-icount.h"
-
 #define HELPER_H "helper.h"
 #include "exec/helper-info.c.inc"
 #undef  HELPER_H
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/translate.c
+++ b/target/m68k/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv NULL_QREG;
 /* Used to distinguish stores from bad addressing modes.  */
 static TCGv store_dummy;
 
-#include "exec/gen-icount.h"
-
 void m68k_tcg_init(void)
 {
     char *p;
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_i32 cpu_iflags;
 static TCGv cpu_res_addr;
 static TCGv_i32 cpu_res_val;
 
-#include "exec/gen-icount.h"
-
 /* This is the state at translation time.  */
 typedef struct DisasContext {
     DisasContextBase base;
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_i32 hflags;
 TCGv_i32 fpu_fcr0, fpu_fcr31;
 TCGv_i64 fpu_f64[32];
 
-#include "exec/gen-icount.h"
-
 static const char regnames_HI[][4] = {
     "HI0", "HI1", "HI2", "HI3",
 };
@@ -XXX,XX +XXX,XX @@ static void gen_mfc0(DisasContext *ctx, TCGv arg, int reg, int sel)
         switch (sel) {
         case CP0_REG09__COUNT:
             /* Mark as an IO operation because we read the time.  */
-            if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-                gen_io_start();
-            }
+            translator_io_start(&ctx->base);
+
             gen_helper_mfc0_count(arg, cpu_env);
             /*
              * Break the TB to be able to take timer interrupts immediately
@@ -XXX,XX +XXX,XX @@ cp0_unimplemented:
 static void gen_mtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
 {
     const char *register_name = "invalid";
+    bool icount;
 
     if (sel != 0) {
         check_insn(ctx, ISA_MIPS_R1);
     }
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    icount = translator_io_start(&ctx->base);
 
     switch (reg) {
     case CP0_REGISTER_00:
@@ -XXX,XX +XXX,XX @@ static void gen_mtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
     trace_mips_translate_c0("mtc0", register_name, reg, sel);
 
     /* For simplicity assume that all writes can cause interrupts.  */
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
+    if (icount) {
         /*
          * DISAS_STOP isn't sufficient, we need to ensure we break out of
          * translated code to check for pending interrupts.
@@ -XXX,XX +XXX,XX @@ static void gen_dmfc0(DisasContext *ctx, TCGv arg, int reg, int sel)
         switch (sel) {
         case CP0_REG09__COUNT:
             /* Mark as an IO operation because we read the time.  */
-            if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-                gen_io_start();
-            }
+            translator_io_start(&ctx->base);
             gen_helper_mfc0_count(arg, cpu_env);
             /*
              * Break the TB to be able to take timer interrupts immediately
@@ -XXX,XX +XXX,XX @@ cp0_unimplemented:
 static void gen_dmtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
 {
     const char *register_name = "invalid";
+    bool icount;
 
     if (sel != 0) {
         check_insn(ctx, ISA_MIPS_R1);
     }
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    icount = translator_io_start(&ctx->base);
 
     switch (reg) {
     case CP0_REGISTER_00:
@@ -XXX,XX +XXX,XX @@ static void gen_dmtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
     trace_mips_translate_c0("dmtc0", register_name, reg, sel);
 
     /* For simplicity assume that all writes can cause interrupts.  */
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
+    if (icount) {
         /*
          * DISAS_STOP isn't sufficient, we need to ensure we break out of
          * translated code to check for pending interrupts.
@@ -XXX,XX +XXX,XX @@ void gen_rdhwr(DisasContext *ctx, int rt, int rd, int sel)
         gen_store_gpr(t0, rt);
         break;
     case 2:
-        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-        }
+        translator_io_start(&ctx->base);
         gen_helper_rdhwr_cc(t0, cpu_env);
         gen_store_gpr(t0, rt);
         /*
diff --git a/target/nios2/translate.c b/target/nios2/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/nios2/translate.c
+++ b/target/nios2/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/cpu_ldst.h"
 #include "exec/translator.h"
 #include "qemu/qemu-print.h"
-#include "exec/gen-icount.h"
 #include "semihosting/semihost.h"
 
 #define HELPER_H "helper.h"
diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/translate.c
+++ b/target/openrisc/translate.c
@@ -XXX,XX +XXX,XX @@
 
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
-#include "exec/gen-icount.h"
 
 #include "exec/log.h"
 
@@ -XXX,XX +XXX,XX @@ static bool trans_l_mfspr(DisasContext *dc, arg_l_mfspr *a)
 
     check_r0_write(dc, a->d);
 
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
+    if (translator_io_start(&dc->base)) {
         if (dc->delayed_branch) {
             tcg_gen_mov_tl(cpu_pc, jmp_pc);
             tcg_gen_discard_tl(jmp_pc);
@@ -XXX,XX +XXX,XX @@ static bool trans_l_mtspr(DisasContext *dc, arg_l_mtspr *a)
 {
     TCGv spr = tcg_temp_new();
 
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&dc->base);
+
     /*
      * For SR, we will need to exit the TB to recognize the new
      * exception state.  For NPC, in theory this counts as a branch
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_reserve_val2;
 static TCGv cpu_fpscr;
 static TCGv_i32 cpu_access_type;
 
-#include "exec/gen-icount.h"
-
 void ppc_translate_init(void)
 {
     int i;
@@ -XXX,XX +XXX,XX @@ static void gen_exception_nip(DisasContext *ctx, uint32_t excp,
 
 static void gen_icount_io_start(DisasContext *ctx)
 {
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-        /*
-         * An I/O instruction must be last in the TB.
-         * Chain to the next TB, and let the code from gen_tb_start
-         * decide if we need to return to the main loop.
-         * Doing this first also allows this value to be overridden.
-         */
-        ctx->base.is_jmp = DISAS_TOO_MANY;
-    }
+    translator_io_start(&ctx->base);
 }
 
 #if !defined(CONFIG_USER_ONLY)
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv load_val;
 static TCGv pm_mask;
 static TCGv pm_base;
 
-#include "exec/gen-icount.h"
-
 /*
  * If an operation is being performed on less than TARGET_LONG_BITS,
  * it may require the inputs to be sign- or zero-extended; which will
diff --git a/target/rx/translate.c b/target/rx/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/translate.c
+++ b/target/rx/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_i64 cpu_acc;
 
 #define cpu_sp cpu_regs[0]
 
-#include "exec/gen-icount.h"
-
 /* decoder helper */
 static uint32_t decode_load_bytes(DisasContext *ctx, uint32_t insn,
                            int i, int n)
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/log.h"
 #include "qemu/host-utils.h"
 #include "exec/cpu_ldst.h"
-#include "exec/gen-icount.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
 
@@ -XXX,XX +XXX,XX @@ static DisasJumpType translate_one(CPUS390XState *env, DisasContext *s)
 
         /* input/output is the special case for icount mode */
         if (unlikely(insn->flags & IF_IO)) {
-            icount = tb_cflags(s->base.tb) & CF_USE_ICOUNT;
-            if (icount) {
-                gen_io_start();
-            }
+            icount = translator_io_start(&s->base);
         }
     }
 
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_fregs[32];
 /* internal register indexes */
 static TCGv cpu_flags, cpu_delayed_pc, cpu_delayed_cond;
 
-#include "exec/gen-icount.h"
-
 void sh4_translate_init(void)
 {
     int i;
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_wim;
 /* Floating point registers */
 static TCGv_i64 cpu_fpr[TARGET_DPREGS];
 
-#include "exec/gen-icount.h"
-
 typedef struct DisasContext {
     DisasContextBase base;
     target_ulong pc;    /* current Program Counter: integer or DYNAMIC_PC */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                         r_const = tcg_constant_i32(dc->mem_idx);
                         tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                        offsetof(CPUSPARCState, tick));
-                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                            gen_io_start();
+                        if (translator_io_start(&dc->base)) {
+                            dc->base.is_jmp = DISAS_EXIT;
                         }
                         gen_helper_tick_get_count(cpu_dst, cpu_env, r_tickptr,
                                                   r_const);
                         gen_store_gpr(dc, rd, cpu_dst);
-                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                            /* I/O operations in icount mode must end the TB */
-                            dc->base.is_jmp = DISAS_EXIT;
-                        }
                     }
                     break;
                 case 0x5: /* V9 rdpc */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                         r_const = tcg_constant_i32(dc->mem_idx);
                         tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                        offsetof(CPUSPARCState, stick));
-                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                            gen_io_start();
+                        if (translator_io_start(&dc->base)) {
+                            dc->base.is_jmp = DISAS_EXIT;
                         }
                         gen_helper_tick_get_count(cpu_dst, cpu_env, r_tickptr,
                                                   r_const);
                         gen_store_gpr(dc, rd, cpu_dst);
-                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                            /* I/O operations in icount mode must end the TB */
-                            dc->base.is_jmp = DISAS_EXIT;
-                        }
                     }
                     break;
                 case 0x19: /* System tick compare */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                         r_const = tcg_constant_i32(dc->mem_idx);
                         tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                        offsetof(CPUSPARCState, tick));
-                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                            gen_io_start();
+                        if (translator_io_start(&dc->base)) {
+                            dc->base.is_jmp = DISAS_EXIT;
                         }
                         gen_helper_tick_get_count(cpu_tmp0, cpu_env,
                                                   r_tickptr, r_const);
-                        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                            /* I/O operations in icount mode must end the TB */
-                            dc->base.is_jmp = DISAS_EXIT;
-                        }
                     }
                     break;
                 case 5: // tba
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, tick));
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_start();
-                                    }
+                                    translator_io_start(&dc->base);
                                     gen_helper_tick_set_limit(r_tickptr,
                                                               cpu_tick_cmpr);
                                     /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, stick));
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_start();
-                                    }
+                                    translator_io_start(&dc->base);
                                     gen_helper_tick_set_count(r_tickptr,
                                                               cpu_tmp0);
                                     /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, stick));
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_start();
-                                    }
+                                    translator_io_start(&dc->base);
                                     gen_helper_tick_set_limit(r_tickptr,
                                                               cpu_stick_cmpr);
                                     /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, tick));
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_start();
-                                    }
+                                    translator_io_start(&dc->base);
                                     gen_helper_tick_set_count(r_tickptr,
                                                               cpu_tmp0);
                                     /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                 break;
                             case 6: // pstate
                                 save_state(dc);
-                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                                    gen_io_start();
-                                }
-                                gen_helper_wrpstate(cpu_env, cpu_tmp0);
-                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                                    /* I/O ops in icount mode must end the TB */
+                                if (translator_io_start(&dc->base)) {
                                     dc->base.is_jmp = DISAS_EXIT;
                                 }
+                                gen_helper_wrpstate(cpu_env, cpu_tmp0);
                                 dc->npc = DYNAMIC_PC;
                                 break;
                             case 7: // tl
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                 dc->npc = DYNAMIC_PC;
                                 break;
                             case 8: // pil
-                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                                    gen_io_start();
-                                }
-                                gen_helper_wrpil(cpu_env, cpu_tmp0);
-                                if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                                    /* I/O ops in icount mode must end the TB */
+                                if (translator_io_start(&dc->base)) {
                                     dc->base.is_jmp = DISAS_EXIT;
                                 }
+                                gen_helper_wrpil(cpu_env, cpu_tmp0);
                                 break;
                             case 9: // cwp
                                 gen_helper_wrcwp(cpu_env, cpu_tmp0);
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     r_tickptr = tcg_temp_new_ptr();
                                     tcg_gen_ld_ptr(r_tickptr, cpu_env,
                                                    offsetof(CPUSPARCState, hstick));
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_start();
-                                    }
+                                    translator_io_start(&dc->base);
                                     gen_helper_tick_set_limit(r_tickptr,
                                                               cpu_hstick_cmpr);
                                     /* End TB to handle timer interrupt */
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                 goto priv_insn;
                             dc->npc = DYNAMIC_PC;
                             dc->pc = DYNAMIC_PC;
-                            if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                                gen_io_start();
-                            }
+                            translator_io_start(&dc->base);
                             gen_helper_done(cpu_env);
                             goto jmp_insn;
                         case 1:
@@ -XXX,XX +XXX,XX @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                 goto priv_insn;
                             dc->npc = DYNAMIC_PC;
                             dc->pc = DYNAMIC_PC;
-                            if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-                                gen_io_start();
-                            }
+                            translator_io_start(&dc->base);
                             gen_helper_retry(cpu_env);
                             goto jmp_insn;
                         default:
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/translate.c
+++ b/target/tricore/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv cpu_PSW_SV;
 static TCGv cpu_PSW_AV;
 static TCGv cpu_PSW_SAV;
 
-#include "exec/gen-icount.h"
-
 static const char *regnames_a[] = {
       "a0"  , "a1"  , "a2"  , "a3" , "a4"  , "a5" ,
       "a6"  , "a7"  , "a8"  , "a9" , "sp" , "a11" ,
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -XXX,XX +XXX,XX @@ static TCGv_i32 cpu_exclusive_val;
 
 static GHashTable *xtensa_regfile_table;
 
-#include "exec/gen-icount.h"
-
 static char *sr_name[256];
 static char *ur_name[256];
 
@@ -XXX,XX +XXX,XX @@ static int gen_postprocess(DisasContext *dc, int slot)
 
 #ifndef CONFIG_USER_ONLY
     if (op_flags & XTENSA_OP_CHECK_INTERRUPTS) {
-        if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-        }
+        translator_io_start(&dc->base);
         gen_helper_check_interrupts(cpu_env);
     }
 #endif
@@ -XXX,XX +XXX,XX @@ static void translate_rsr_ccount(DisasContext *dc, const OpcodeArg arg[],
                                  const uint32_t par[])
 {
 #ifndef CONFIG_USER_ONLY
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&dc->base);
     gen_helper_update_ccount(cpu_env);
     tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]);
 #endif
@@ -XXX,XX +XXX,XX @@ static void translate_waiti(DisasContext *dc, const OpcodeArg arg[],
 #ifndef CONFIG_USER_ONLY
     TCGv_i32 pc = tcg_constant_i32(dc->base.pc_next);
 
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&dc->base);
     gen_helper_waiti(cpu_env, pc, tcg_constant_i32(arg[0].imm));
 #endif
 }
@@ -XXX,XX +XXX,XX @@ static void translate_wsr_ccompare(DisasContext *dc, const OpcodeArg arg[],
     uint32_t id = par[0] - CCOMPARE;
 
     assert(id < dc->config->nccompare);
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&dc->base);
     tcg_gen_mov_i32(cpu_SR[par[0]], arg[0].in);
     gen_helper_update_ccompare(cpu_env, tcg_constant_i32(id));
 #endif
@@ -XXX,XX +XXX,XX @@ static void translate_wsr_ccount(DisasContext *dc, const OpcodeArg arg[],
                                  const uint32_t par[])
 {
 #ifndef CONFIG_USER_ONLY
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&dc->base);
     gen_helper_wsr_ccount(cpu_env, arg[0].in);
 #endif
 }
@@ -XXX,XX +XXX,XX @@ static void translate_xsr_ccount(DisasContext *dc, const OpcodeArg arg[],
 #ifndef CONFIG_USER_ONLY
     TCGv_i32 tmp = tcg_temp_new_i32();
 
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
-
+    translator_io_start(&dc->base);
     gen_helper_update_ccount(cpu_env);
     tcg_gen_mov_i32(tmp, cpu_SR[par[0]]);
     gen_helper_wsr_ccount(cpu_env, arg[0].in);
diff --git a/target/loongarch/insn_trans/trans_extra.c.inc b/target/loongarch/insn_trans/trans_extra.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/insn_trans/trans_extra.c.inc
+++ b/target/loongarch/insn_trans/trans_extra.c.inc
@@ -XXX,XX +XXX,XX @@ static bool gen_rdtime(DisasContext *ctx, arg_rr *a,
     TCGv dst1 = gpr_dst(ctx, a->rd, EXT_NONE);
     TCGv dst2 = gpr_dst(ctx, a->rj, EXT_NONE);
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&ctx->base);
     gen_helper_rdtime_d(dst1, cpu_env);
     if (word) {
         tcg_gen_sextract_tl(dst1, dst1, high ? 32 : 0, 32);
diff --git a/target/loongarch/insn_trans/trans_privileged.c.inc b/target/loongarch/insn_trans/trans_privileged.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/insn_trans/trans_privileged.c.inc
+++ b/target/loongarch/insn_trans/trans_privileged.c.inc
@@ -XXX,XX +XXX,XX @@ static bool check_csr_flags(DisasContext *ctx, const CSRInfo *csr, bool write)
     if ((csr->flags & CSRFL_READONLY) && write) {
         return false;
     }
-    if ((csr->flags & CSRFL_IO) &&
-        (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT)) {
-        gen_io_start();
+    if ((csr->flags & CSRFL_IO) && translator_io_start(&ctx->base)) {
         ctx->base.is_jmp = DISAS_EXIT_UPDATE;
     } else if ((csr->flags & CSRFL_EXITTB) && write) {
         ctx->base.is_jmp = DISAS_EXIT_UPDATE;
diff --git a/target/riscv/insn_trans/trans_privileged.c.inc b/target/riscv/insn_trans/trans_privileged.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/insn_trans/trans_privileged.c.inc
+++ b/target/riscv/insn_trans/trans_privileged.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_sret(DisasContext *ctx, arg_sret *a)
 #ifndef CONFIG_USER_ONLY
     if (has_ext(ctx, RVS)) {
         decode_save_opc(ctx);
-        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_start();
-        }
+        translator_io_start(&ctx->base);
         gen_helper_sret(cpu_pc, cpu_env);
         exit_tb(ctx); /* no chaining */
         ctx->base.is_jmp = DISAS_NORETURN;
@@ -XXX,XX +XXX,XX @@ static bool trans_mret(DisasContext *ctx, arg_mret *a)
 {
 #ifndef CONFIG_USER_ONLY
     decode_save_opc(ctx);
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&ctx->base);
     gen_helper_mret(cpu_pc, cpu_env);
     exit_tb(ctx); /* no chaining */
     ctx->base.is_jmp = DISAS_NORETURN;
diff --git a/target/riscv/insn_trans/trans_rvi.c.inc b/target/riscv/insn_trans/trans_rvi.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/insn_trans/trans_rvi.c.inc
+++ b/target/riscv/insn_trans/trans_rvi.c.inc
@@ -XXX,XX +XXX,XX @@ static bool do_csrr(DisasContext *ctx, int rd, int rc)
     TCGv dest = dest_gpr(ctx, rd);
     TCGv_i32 csr = tcg_constant_i32(rc);
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&ctx->base);
     gen_helper_csrr(dest, cpu_env, csr);
     gen_set_gpr(ctx, rd, dest);
     return do_csr_post(ctx);
@@ -XXX,XX +XXX,XX @@ static bool do_csrw(DisasContext *ctx, int rc, TCGv src)
 {
     TCGv_i32 csr = tcg_constant_i32(rc);
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&ctx->base);
     gen_helper_csrw(cpu_env, csr, src);
     return do_csr_post(ctx);
 }
@@ -XXX,XX +XXX,XX @@ static bool do_csrrw(DisasContext *ctx, int rd, int rc, TCGv src, TCGv mask)
     TCGv dest = dest_gpr(ctx, rd);
     TCGv_i32 csr = tcg_constant_i32(rc);
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&ctx->base);
     gen_helper_csrrw(dest, cpu_env, csr, src, mask);
     gen_set_gpr(ctx, rd, dest);
     return do_csr_post(ctx);
@@ -XXX,XX +XXX,XX @@ static bool do_csrr_i128(DisasContext *ctx, int rd, int rc)
     TCGv desth = dest_gprh(ctx, rd);
     TCGv_i32 csr = tcg_constant_i32(rc);
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&ctx->base);
     gen_helper_csrr_i128(destl, cpu_env, csr);
     tcg_gen_ld_tl(desth, cpu_env, offsetof(CPURISCVState, retxh));
     gen_set_gpr128(ctx, rd, destl, desth);
@@ -XXX,XX +XXX,XX @@ static bool do_csrw_i128(DisasContext *ctx, int rc, TCGv srcl, TCGv srch)
 {
     TCGv_i32 csr = tcg_constant_i32(rc);
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&ctx->base);
     gen_helper_csrw_i128(cpu_env, csr, srcl, srch);
     return do_csr_post(ctx);
 }
@@ -XXX,XX +XXX,XX @@ static bool do_csrrw_i128(DisasContext *ctx, int rd, int rc,
     TCGv desth = dest_gprh(ctx, rd);
     TCGv_i32 csr = tcg_constant_i32(rc);
 
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_start();
-    }
+    translator_io_start(&ctx->base);
     gen_helper_csrrw_i128(destl, cpu_env, csr, srcl, srch, maskl, maskh);
     tcg_gen_ld_tl(desth, cpu_env, offsetof(CPURISCVState, retxh));
     gen_set_gpr128(ctx, rd, destl, desth);
-- 
2.34.1

From: Philippe Mathieu-Daudé <philmd@linaro.org>

Now that gen_icount_io_start() is a simple wrapper to
translator_io_start(), inline it.

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-Id: <20230602095439.48102-1-philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/translate.c                 | 63 ++++++++++++--------------
 target/ppc/power8-pmu-regs.c.inc       | 10 ++--
 target/ppc/translate/branch-impl.c.inc |  2 +-
 3 files changed, 35 insertions(+), 40 deletions(-)

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_exception_nip(DisasContext *ctx, uint32_t excp,
     ctx->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_icount_io_start(DisasContext *ctx)
-{
-    translator_io_start(&ctx->base);
-}
-
 #if !defined(CONFIG_USER_ONLY)
 static void gen_ppc_maybe_interrupt(DisasContext *ctx)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_ppc_maybe_interrupt(cpu_env);
 }
 #endif
@@ -XXX,XX +XXX,XX @@ void spr_write_ureg(DisasContext *ctx, int sprn, int gprn)
 #if !defined(CONFIG_USER_ONLY)
 void spr_read_decr(DisasContext *ctx, int gprn, int sprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_load_decr(cpu_gpr[gprn], cpu_env);
 }
 
 void spr_write_decr(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_decr(cpu_env, cpu_gpr[gprn]);
 }
 #endif
@@ -XXX,XX +XXX,XX @@ void spr_write_decr(DisasContext *ctx, int sprn, int gprn)
 /* Time base */
 void spr_read_tbl(DisasContext *ctx, int gprn, int sprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_load_tbl(cpu_gpr[gprn], cpu_env);
 }
 
 void spr_read_tbu(DisasContext *ctx, int gprn, int sprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_load_tbu(cpu_gpr[gprn], cpu_env);
 }
 
@@ -XXX,XX +XXX,XX @@ void spr_read_atbu(DisasContext *ctx, int gprn, int sprn)
 #if !defined(CONFIG_USER_ONLY)
 void spr_write_tbl(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_tbl(cpu_env, cpu_gpr[gprn]);
 }
 
 void spr_write_tbu(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_tbu(cpu_env, cpu_gpr[gprn]);
 }
 
@@ -XXX,XX +XXX,XX @@ void spr_write_atbu(DisasContext *ctx, int sprn, int gprn)
 #if defined(TARGET_PPC64)
 void spr_read_purr(DisasContext *ctx, int gprn, int sprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_load_purr(cpu_gpr[gprn], cpu_env);
 }
 
 void spr_write_purr(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_purr(cpu_env, cpu_gpr[gprn]);
 }
 
 /* HDECR */
 void spr_read_hdecr(DisasContext *ctx, int gprn, int sprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_load_hdecr(cpu_gpr[gprn], cpu_env);
 }
 
 void spr_write_hdecr(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_hdecr(cpu_env, cpu_gpr[gprn]);
 }
 
 void spr_read_vtb(DisasContext *ctx, int gprn, int sprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_load_vtb(cpu_gpr[gprn], cpu_env);
 }
 
 void spr_write_vtb(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_vtb(cpu_env, cpu_gpr[gprn]);
 }
 
 void spr_write_tbu40(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_tbu40(cpu_env, cpu_gpr[gprn]);
 }
 
@@ -XXX,XX +XXX,XX @@ void spr_write_dpdes(DisasContext *ctx, int sprn, int gprn)
 #if !defined(CONFIG_USER_ONLY)
 void spr_read_40x_pit(DisasContext *ctx, int gprn, int sprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_load_40x_pit(cpu_gpr[gprn], cpu_env);
 }
 
 void spr_write_40x_pit(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_40x_pit(cpu_env, cpu_gpr[gprn]);
 }
 
 void spr_write_40x_dbcr0(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_store_spr(sprn, cpu_gpr[gprn]);
     gen_helper_store_40x_dbcr0(cpu_env, cpu_gpr[gprn]);
     /* We must stop translation as we may have rebooted */
@@ -XXX,XX +XXX,XX @@ void spr_write_40x_dbcr0(DisasContext *ctx, int sprn, int gprn)
 
 void spr_write_40x_sler(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_40x_sler(cpu_env, cpu_gpr[gprn]);
 }
 
 void spr_write_40x_tcr(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_40x_tcr(cpu_env, cpu_gpr[gprn]);
 }
 
 void spr_write_40x_tsr(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_40x_tsr(cpu_env, cpu_gpr[gprn]);
 }
 
@@ -XXX,XX +XXX,XX @@ void spr_write_40x_pid(DisasContext *ctx, int sprn, int gprn)
 
 void spr_write_booke_tcr(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_booke_tcr(cpu_env, cpu_gpr[gprn]);
 }
 
 void spr_write_booke_tsr(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_booke_tsr(cpu_env, cpu_gpr[gprn]);
 }
 #endif
@@ -XXX,XX +XXX,XX @@ static void gen_darn(DisasContext *ctx)
     if (l > 2) {
         tcg_gen_movi_i64(cpu_gpr[rD(ctx->opcode)], -1);
     } else {
-        gen_icount_io_start(ctx);
+        translator_io_start(&ctx->base);
         if (l == 0) {
             gen_helper_darn32(cpu_gpr[rD(ctx->opcode)]);
         } else {
@@ -XXX,XX +XXX,XX @@ static void pmu_count_insns(DisasContext *ctx)
      * running with icount and we do not handle it beforehand,
      * the helper can trigger a 'bad icount read'.
      */
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
 
     /* Avoid helper calls when only PMC5-6 are enabled. */
     if (!ctx->pmc_other) {
@@ -XXX,XX +XXX,XX @@ static void gen_rfi(DisasContext *ctx)
     }
     /* Restore CPU state */
     CHK_SV(ctx);
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_update_cfar(ctx, ctx->cia);
     gen_helper_rfi(cpu_env);
     ctx->base.is_jmp = DISAS_EXIT;
@@ -XXX,XX +XXX,XX @@ static void gen_rfid(DisasContext *ctx)
 #else
     /* Restore CPU state */
     CHK_SV(ctx);
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_update_cfar(ctx, ctx->cia);
     gen_helper_rfid(cpu_env);
     ctx->base.is_jmp = DISAS_EXIT;
@@ -XXX,XX +XXX,XX @@ static void gen_rfscv(DisasContext *ctx)
 #else
     /* Restore CPU state */
     CHK_SV(ctx);
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_update_cfar(ctx, ctx->cia);
     gen_helper_rfscv(cpu_env);
     ctx->base.is_jmp = DISAS_EXIT;
@@ -XXX,XX +XXX,XX @@ static void gen_mtmsrd(DisasContext *ctx)
     t0 = tcg_temp_new();
     t1 = tcg_temp_new();
 
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
 
     if (ctx->opcode & 0x00010000) {
         /* L=1 form only updates EE and RI */
@@ -XXX,XX +XXX,XX @@ static void gen_mtmsr(DisasContext *ctx)
     t0 = tcg_temp_new();
     t1 = tcg_temp_new();
 
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     if (ctx->opcode & 0x00010000) {
         /* L=1 form only updates EE and RI */
         mask &= (1ULL << MSR_RI) | (1ULL << MSR_EE);
diff --git a/target/ppc/power8-pmu-regs.c.inc b/target/ppc/power8-pmu-regs.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/power8-pmu-regs.c.inc
+++ b/target/ppc/power8-pmu-regs.c.inc
@@ -XXX,XX +XXX,XX @@ static void write_MMCR0_common(DisasContext *ctx, TCGv val)
     /*
      * helper_store_mmcr0 will make clock based operations that
      * will cause 'bad icount read' errors if we do not execute
-     * gen_icount_io_start() beforehand.
+     * translator_io_start() beforehand.
      */
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_mmcr0(cpu_env, val);
 
     /*
@@ -XXX,XX +XXX,XX @@ void spr_read_PMC(DisasContext *ctx, int gprn, int sprn)
 {
     TCGv_i32 t_sprn = tcg_constant_i32(sprn);
 
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_read_pmc(cpu_gpr[gprn], cpu_env, t_sprn);
 }
 
@@ -XXX,XX +XXX,XX @@ void spr_write_PMC(DisasContext *ctx, int sprn, int gprn)
 {
     TCGv_i32 t_sprn = tcg_constant_i32(sprn);
 
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_pmc(cpu_env, t_sprn, cpu_gpr[gprn]);
 }
 
@@ -XXX,XX +XXX,XX @@ void spr_write_MMCR0(DisasContext *ctx, int sprn, int gprn)
 
 void spr_write_MMCR1(DisasContext *ctx, int sprn, int gprn)
 {
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_helper_store_mmcr1(cpu_env, cpu_gpr[gprn]);
 }
 #else
diff --git a/target/ppc/translate/branch-impl.c.inc b/target/ppc/translate/branch-impl.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate/branch-impl.c.inc
+++ b/target/ppc/translate/branch-impl.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_RFEBB(DisasContext *ctx, arg_XL_s *arg)
 {
     REQUIRE_INSNS_FLAGS2(ctx, ISA207S);
 
-    gen_icount_io_start(ctx);
+    translator_io_start(&ctx->base);
     gen_update_cfar(ctx, ctx->cia);
     gen_helper_rfebb(cpu_env, cpu_gpr[arg->s]);
 
-- 
2.34.1

translator_fake_ldb is used by exactly one target (s390x, for "ex")
in extraordinary circumstances, so move it out of line.
This means that translator.h need not include plugin-gen.h;
translator.c already includes plugin-gen.h.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/translator.h | 8 +-------
 accel/tcg/translator.c    | 5 +++++
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/bswap.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
-#include "exec/plugin-gen.h"
 #include "exec/translate-all.h"
 #include "tcg/tcg.h"
 
@@ -XXX,XX +XXX,XX @@ translator_ldq_swap(CPUArchState *env, DisasContextBase *db,
  * re-synthesised for s390x "ex"). It ensures we update other areas of
  * the translator with details of the executed instruction.
  */
-
-static inline void translator_fake_ldb(uint8_t insn8, abi_ptr pc)
-{
-    plugin_insn_append(pc, &insn8, sizeof(insn8));
-}
-
+void translator_fake_ldb(uint8_t insn8, abi_ptr pc);
 
 /*
  * Return whether addr is on the same page as where disassembly started.
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@ uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
     plugin_insn_append(pc, &plug, sizeof(ret));
     return ret;
 }
+
+void translator_fake_ldb(uint8_t insn8, abi_ptr pc)
+{
+    plugin_insn_append(pc, &insn8, sizeof(insn8));
+}
-- 
2.34.1

Move most includes from *translate*.c to translate.h, ensuring
that we get the ordering correct.  Ensure cpu.h is first.
Use disas/disas.h instead of exec/log.h.
Drop otherwise unused includes.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/translate.h        |  3 +++
 target/arm/tcg/translate-a64.c    | 17 +++++------------
 target/arm/tcg/translate-m-nocp.c |  2 --
 target/arm/tcg/translate-mve.c    |  3 ---
 target/arm/tcg/translate-neon.c   |  3 ---
 target/arm/tcg/translate-sme.c    |  6 ------
 target/arm/tcg/translate-sve.c    |  9 ---------
 target/arm/tcg/translate-vfp.c    |  3 ---
 target/arm/tcg/translate.c        | 17 +++++------------
 9 files changed, 13 insertions(+), 50 deletions(-)

diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate.h
+++ b/target/arm/tcg/translate.h
@@ -XXX,XX +XXX,XX @@
 #ifndef TARGET_ARM_TRANSLATE_H
 #define TARGET_ARM_TRANSLATE_H
 
+#include "cpu.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "exec/translator.h"
 #include "exec/helper-gen.h"
 #include "internals.h"
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-a64.c
+++ b/target/arm/tcg/translate-a64.c
@@ -XXX,XX +XXX,XX @@
  */
 #include "qemu/osdep.h"
 
-#include "cpu.h"
-#include "exec/exec-all.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "qemu/log.h"
-#include "arm_ldst.h"
 #include "translate.h"
-#include "internals.h"
-#include "qemu/host-utils.h"
-#include "semihosting/semihost.h"
-#include "exec/log.h"
-#include "cpregs.h"
 #include "translate-a64.h"
-#include "qemu/atomic128.h"
+#include "qemu/log.h"
+#include "disas/disas.h"
+#include "arm_ldst.h"
+#include "semihosting/semihost.h"
+#include "cpregs.h"
 
 static TCGv_i64 cpu_X[32];
 static TCGv_i64 cpu_pc;
diff --git a/target/arm/tcg/translate-m-nocp.c b/target/arm/tcg/translate-m-nocp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-m-nocp.c
+++ b/target/arm/tcg/translate-m-nocp.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate-mve.c b/target/arm/tcg/translate-mve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-mve.c
+++ b/target/arm/tcg/translate-mve.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "exec/exec-all.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate-neon.c b/target/arm/tcg/translate-neon.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-neon.c
+++ b/target/arm/tcg/translate-neon.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "exec/exec-all.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "cpu.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "tcg/tcg-gvec-desc.h"
 #include "translate.h"
 #include "translate-a64.h"
-#include "fpu/softfloat.h"
-
 
 /*
  * Include the generated decoder.
diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-sve.c
+++ b/target/arm/tcg/translate-sve.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "cpu.h"
-#include "exec/exec-all.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "tcg/tcg-gvec-desc.h"
-#include "qemu/log.h"
-#include "arm_ldst.h"
 #include "translate.h"
-#include "internals.h"
-#include "exec/log.h"
 #include "translate-a64.h"
 #include "fpu/softfloat.h"
 
diff --git a/target/arm/tcg/translate-vfp.c b/target/arm/tcg/translate-vfp.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate-vfp.c
+++ b/target/arm/tcg/translate-vfp.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "exec/exec-all.h"
 #include "translate.h"
 #include "translate-a32.h"
 
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate.c
+++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
  */
 #include "qemu/osdep.h"
 
-#include "cpu.h"
-#include "internals.h"
-#include "disas/disas.h"
-#include "exec/exec-all.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "qemu/log.h"
-#include "qemu/bitops.h"
-#include "arm_ldst.h"
-#include "semihosting/semihost.h"
-#include "exec/log.h"
-#include "cpregs.h"
 #include "translate.h"
 #include "translate-a32.h"
+#include "qemu/log.h"
+#include "disas/disas.h"
+#include "arm_ldst.h"
+#include "semihosting/semihost.h"
+#include "cpregs.h"
 #include "exec/helper-proto.h"
 
 #define HELPER_H "helper.h"
-- 
2.34.1

Move most includes from *translate*.c to translate.h, ensuring
that we get the ordering correct.  Ensure cpu.h is first.
Use disas/disas.h instead of exec/log.h.
Drop otherwise unused includes.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/mips/tcg/translate.h            |  6 ++++--
 target/mips/tcg/msa_translate.c        |  3 ---
 target/mips/tcg/mxu_translate.c        |  2 --
 target/mips/tcg/octeon_translate.c     |  4 +---
 target/mips/tcg/rel6_translate.c       |  2 --
 target/mips/tcg/translate.c            | 18 ++++++------------
 target/mips/tcg/translate_addr_const.c |  1 -
 target/mips/tcg/tx79_translate.c       |  4 +---
 target/mips/tcg/vr54xx_translate.c     |  3 ---
 9 files changed, 12 insertions(+), 31 deletions(-)

diff --git a/target/mips/tcg/translate.h b/target/mips/tcg/translate.h
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.h
+++ b/target/mips/tcg/translate.h
@@ -XXX,XX +XXX,XX @@
 #ifndef TARGET_MIPS_TRANSLATE_H
 #define TARGET_MIPS_TRANSLATE_H
 
-#include "qemu/log.h"
-#include "exec/translator.h"
+#include "cpu.h"
 #include "tcg/tcg-op.h"
+#include "exec/translator.h"
+#include "exec/helper-gen.h"
+#include "qemu/log.h"
 
 #define MIPS_DEBUG_DISAS 0
 
diff --git a/target/mips/tcg/msa_translate.c b/target/mips/tcg/msa_translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/msa_translate.c
+++ b/target/mips/tcg/msa_translate.c
@@ -XXX,XX +XXX,XX @@
  * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "exec/helper-gen.h"
 #include "translate.h"
 #include "fpu_helper.h"
-#include "internal.h"
 
 static int elm_n(DisasContext *ctx, int x);
 static int elm_df(DisasContext *ctx, int x);
diff --git a/target/mips/tcg/mxu_translate.c b/target/mips/tcg/mxu_translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/mxu_translate.c
+++ b/target/mips/tcg/mxu_translate.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "exec/helper-gen.h"
 #include "translate.h"
 
 /*
diff --git a/target/mips/tcg/octeon_translate.c b/target/mips/tcg/octeon_translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/octeon_translate.c
+++ b/target/mips/tcg/octeon_translate.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "exec/helper-gen.h"
 #include "translate.h"
+#include "tcg/tcg-op-gvec.h"
 
 /* Include the auto-generated decoder.  */
 #include "decode-octeon.c.inc"
diff --git a/target/mips/tcg/rel6_translate.c b/target/mips/tcg/rel6_translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/rel6_translate.c
+++ b/target/mips/tcg/rel6_translate.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "exec/helper-gen.h"
 #include "translate.h"
 
 /* Include the auto-generated decoders.  */
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "cpu.h"
-#include "internal.h"
-#include "tcg/tcg-op.h"
-#include "exec/translator.h"
-#include "exec/helper-proto.h"
-#include "exec/helper-gen.h"
-#include "semihosting/semihost.h"
-
-#include "trace.h"
-#include "exec/log.h"
-#include "qemu/qemu-print.h"
-#include "fpu_helper.h"
 #include "translate.h"
+#include "internal.h"
+#include "exec/helper-proto.h"
+#include "semihosting/semihost.h"
+#include "trace.h"
+#include "disas/disas.h"
+#include "fpu_helper.h"
 
 #define HELPER_H "helper.h"
 #include "exec/helper-info.c.inc"
diff --git a/target/mips/tcg/translate_addr_const.c b/target/mips/tcg/translate_addr_const.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate_addr_const.c
+++ b/target/mips/tcg/translate_addr_const.c
@@ -XXX,XX +XXX,XX @@
  * SPDX-License-Identifier: LGPL-2.1-or-later
  */
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
 #include "translate.h"
 
 bool gen_lsa(DisasContext *ctx, int rd, int rt, int rs, int sa)
diff --git a/target/mips/tcg/tx79_translate.c b/target/mips/tcg/tx79_translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/tx79_translate.c
+++ b/target/mips/tcg/tx79_translate.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "tcg/tcg-op-gvec.h"
-#include "exec/helper-gen.h"
 #include "translate.h"
+#include "tcg/tcg-op-gvec.h"
 
 /* Include the auto-generated decoder.  */
 #include "decode-tx79.c.inc"
diff --git a/target/mips/tcg/vr54xx_translate.c b/target/mips/tcg/vr54xx_translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/vr54xx_translate.c
+++ b/target/mips/tcg/vr54xx_translate.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "tcg/tcg-op.h"
-#include "exec/helper-gen.h"
 #include "translate.h"
-#include "internal.h"
 
 /* Include the auto-generated decoder. */
 #include "decode-vr54xx.c.inc"
-- 
2.34.1

The exec/translation-block.h header had been pulled in via
exec/exec-all.h, via exec/translator.h, but the include of
exec-all.h will be removed.  Include it directly where it is needed.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/translate.c   | 1 +
 target/loongarch/translate.c | 3 +--
 target/mips/tcg/translate.c  | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-op-gvec.h"
 #include "exec/helper-gen.h"
 #include "exec/helper-proto.h"
+#include "exec/translation-block.h"
 #include "exec/cpu_ldst.h"
 #include "exec/log.h"
 #include "internal.h"
diff --git a/target/loongarch/translate.c b/target/loongarch/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/translate.c
+++ b/target/loongarch/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
-
+#include "exec/translation-block.h"
 #include "exec/translator.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
-
 #include "exec/log.h"
 #include "qemu/qemu-print.h"
 #include "fpu/softfloat.h"
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@
 #include "translate.h"
 #include "internal.h"
 #include "exec/helper-proto.h"
+#include "exec/translation-block.h"
 #include "semihosting/semihost.h"
 #include "trace.h"
 #include "disas/disas.h"
-- 
2.34.1

Reduce the header to only bswap.h and cpu_ldst.h.
Move exec/translate-all.h to translator.c.
Reduce tcg.h and tcg-op.h to tcg-op-common.h.
Remove otherwise unused headers.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/translator.h | 6 +-----
 accel/tcg/translator.c    | 8 +++-----
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@
  * member in your target-specific DisasContext.
  */
 
-
 #include "qemu/bswap.h"
-#include "exec/exec-all.h"
-#include "exec/cpu_ldst.h"
-#include "exec/translate-all.h"
-#include "tcg/tcg.h"
+#include "exec/cpu_ldst.h"	/* for abi_ptr */
 
 /**
  * gen_intermediate_code
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "qemu/error-report.h"
-#include "tcg/tcg.h"
-#include "tcg/tcg-op.h"
 #include "exec/exec-all.h"
-#include "exec/log.h"
 #include "exec/translator.h"
+#include "exec/translate-all.h"
 #include "exec/plugin-gen.h"
-#include "exec/replay-core.h"
-
+#include "tcg/tcg-op-common.h"
 
 static void gen_io_start(void)
 {
-- 
2.34.1

The PAGE_* constants were used where the PROT_* constants were intended.
The bug was hidden because they happen to have the same values.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/region.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/tcg/region.c b/tcg/region.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/region.c
+++ b/tcg/region.c
@@ -XXX,XX +XXX,XX @@ static int alloc_code_gen_buffer(size_t tb_size, int splitwx, Error **errp)
     return PROT_READ | PROT_WRITE;
 }
 #elif defined(_WIN32)
+/*
+ * Local source-level compatibility with Unix.
+ * Used by tcg_region_init below.
+ */
+#define PROT_READ   1
+#define PROT_WRITE  2
+#define PROT_EXEC   4
+
 static int alloc_code_gen_buffer(size_t size, int splitwx, Error **errp)
 {
     void *buf;
@@ -XXX,XX +XXX,XX @@ static int alloc_code_gen_buffer(size_t size, int splitwx, Error **errp)
     region.start_aligned = buf;
     region.total_size = size;
 
-    return PAGE_READ | PAGE_WRITE | PAGE_EXEC;
+    return PROT_READ | PROT_WRITE | PROT_EXEC;
 }
 #else
 static int alloc_code_gen_buffer_anon(size_t size, int prot,
@@ -XXX,XX +XXX,XX @@ void tcg_region_init(size_t tb_size, int splitwx, unsigned max_cpus)
      * buffer -- let that one use hugepages throughout.
      * Work with the page protections set up with the initial mapping.
      */
-    need_prot = PAGE_READ | PAGE_WRITE;
+    need_prot = PROT_READ | PROT_WRITE;
 #ifndef CONFIG_TCG_INTERPRETER
     if (tcg_splitwx_diff == 0) {
-        need_prot |= PAGE_EXEC;
+        need_prot |= PROT_EXEC;
     }
 #endif
     for (size_t i = 0, n = region.n; i < n; i++) {
@@ -XXX,XX +XXX,XX @@ void tcg_region_init(size_t tb_size, int splitwx, unsigned max_cpus)
         if (have_prot != need_prot) {
             int rc;
 
-            if (need_prot == (PAGE_READ | PAGE_WRITE | PAGE_EXEC)) {
+            if (need_prot == (PROT_READ | PROT_WRITE | PROT_EXEC)) {
                 rc = qemu_mprotect_rwx(start, end - start);
-            } else if (need_prot == (PAGE_READ | PAGE_WRITE)) {
+            } else if (need_prot == (PROT_READ | PROT_WRITE)) {
                 rc = qemu_mprotect_rw(start, end - start);
             } else {
                 g_assert_not_reached();
-- 
2.34.1

Since the change to CPUArchState, we have a common typedef
that can always be used.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/helper-head.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/exec/helper-head.h b/include/exec/helper-head.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/helper-head.h
+++ b/include/exec/helper-head.h
@@ -XXX,XX +XXX,XX @@
 #define dh_alias_f64 i64
 #define dh_alias_ptr ptr
 #define dh_alias_cptr ptr
+#define dh_alias_env ptr
 #define dh_alias_void void
 #define dh_alias_noreturn noreturn
 #define dh_alias(t) glue(dh_alias_, t)
@@ -XXX,XX +XXX,XX @@
 #define dh_ctype_f64 float64
 #define dh_ctype_ptr void *
 #define dh_ctype_cptr const void *
+#define dh_ctype_env CPUArchState *
 #define dh_ctype_void void
 #define dh_ctype_noreturn G_NORETURN void
 #define dh_ctype(t) dh_ctype_##t
@@ -XXX,XX +XXX,XX @@
 #  endif
 # endif
 # define dh_ctype_tl target_ulong
-# define dh_alias_env ptr
-# define dh_ctype_env CPUArchState *
-# define dh_typecode_env dh_typecode_ptr
 #endif
 
 /* We can't use glue() here because it falls foul of C preprocessor
@@ -XXX,XX +XXX,XX @@
 #define dh_typecode_f32 dh_typecode_i32
 #define dh_typecode_f64 dh_typecode_i64
 #define dh_typecode_cptr dh_typecode_ptr
+#define dh_typecode_env dh_typecode_ptr
 #define dh_typecode(t) dh_typecode_##t
 
 #define dh_callflag_i32  0
-- 
2.34.1

This finally paves the way for tcg/ to be built once per mode.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h      | 1 -
 accel/tcg/plugin-gen.c | 1 +
 tcg/region.c           | 2 +-
 tcg/tcg-op.c           | 2 +-
 tcg/tcg.c              | 2 +-
 5 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@
 #ifndef TCG_H
 #define TCG_H
 
-#include "cpu.h"
 #include "exec/memop.h"
 #include "exec/memopidx.h"
 #include "qemu/bitops.h"
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@
  * CPU's index into a TCG temp, since the first callback did it already.
  */
 #include "qemu/osdep.h"
+#include "cpu.h"
 #include "tcg/tcg.h"
 #include "tcg/tcg-temp-internal.h"
 #include "tcg/tcg-op.h"
diff --git a/tcg/region.c b/tcg/region.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/region.c
+++ b/tcg/region.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/cacheinfo.h"
 #include "qemu/qtree.h"
 #include "qapi/error.h"
-#include "exec/exec-all.h"
 #include "tcg/tcg.h"
+#include "exec/translation-block.h"
 #include "tcg-internal.h"
 
 
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
-#include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "tcg/tcg-temp-internal.h"
 #include "tcg/tcg-op-common.h"
+#include "exec/translation-block.h"
 #include "exec/plugin-gen.h"
 #include "tcg-internal.h"
 
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/cacheflush.h"
 #include "qemu/cacheinfo.h"
 #include "qemu/timer.h"
-#include "exec/exec-all.h"
+#include "exec/translation-block.h"
 #include "exec/tlb-common.h"
 #include "tcg/tcg-op-common.h"
 
-- 
2.34.1

plugin_insn_append is only used in translator.c, and uses a
target-specific typedef: abi_ptr.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/plugin-gen.h | 22 ----------------------
 accel/tcg/translator.c    | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/plugin-gen.h
+++ b/include/exec/plugin-gen.h
@@ -XXX,XX +XXX,XX @@ void plugin_gen_insn_end(void);
 void plugin_gen_disable_mem_helpers(void);
 void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info);
 
-static inline void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
-{
-    struct qemu_plugin_insn *insn = tcg_ctx->plugin_insn;
-    abi_ptr off;
-
-    if (insn == NULL) {
-        return;
-    }
-    off = pc - insn->vaddr;
-    if (off < insn->data->len) {
-        g_byte_array_set_size(insn->data, off);
-    } else if (off > insn->data->len) {
-        /* we have an unexpected gap */
-        g_assert_not_reached();
-    }
-
-    insn->data = g_byte_array_append(insn->data, from, size);
-}
-
 #else /* !CONFIG_PLUGIN */
 
 static inline bool
@@ -XXX,XX +XXX,XX @@ static inline void plugin_gen_disable_mem_helpers(void)
 static inline void plugin_gen_empty_mem_callback(TCGv_i64 addr, uint32_t info)
 { }
 
-static inline void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
-{ }
-
 #endif /* CONFIG_PLUGIN */
 
 #endif /* QEMU_PLUGIN_GEN_H */
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -XXX,XX +XXX,XX @@ static void *translator_access(CPUArchState *env, DisasContextBase *db,
     return host + (pc - base);
 }
 
+static void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
+{
+#ifdef CONFIG_PLUGIN
+    struct qemu_plugin_insn *insn = tcg_ctx->plugin_insn;
+    abi_ptr off;
+
+    if (insn == NULL) {
+        return;
+    }
+    off = pc - insn->vaddr;
+    if (off < insn->data->len) {
+        g_byte_array_set_size(insn->data, off);
+    } else if (off > insn->data->len) {
+        /* we have an unexpected gap */
+        g_assert_not_reached();
+    }
+
+    insn->data = g_byte_array_append(insn->data, from, size);
+#endif
+}
+
 uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
 {
     uint8_t ret;
-- 
2.34.1

If CONFIG_USER_ONLY is ok generically, so is CONFIG_SOFTMMU,
because they are exact opposites.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/poison.h         | 1 -
 scripts/make-config-poison.sh | 5 +++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/exec/poison.h b/include/exec/poison.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/poison.h
+++ b/include/exec/poison.h
@@ -XXX,XX +XXX,XX @@
 #pragma GCC poison CONFIG_HVF
 #pragma GCC poison CONFIG_LINUX_USER
 #pragma GCC poison CONFIG_KVM
-#pragma GCC poison CONFIG_SOFTMMU
 #pragma GCC poison CONFIG_WHPX
 #pragma GCC poison CONFIG_XEN
 
diff --git a/scripts/make-config-poison.sh b/scripts/make-config-poison.sh
index XXXXXXX..XXXXXXX 100755
--- a/scripts/make-config-poison.sh
+++ b/scripts/make-config-poison.sh
@@ -XXX,XX +XXX,XX @@ if test $# = 0; then
   exit 0
 fi
 
-# Create list of config switches that should be poisoned in common code...
-# but filter out CONFIG_TCG and CONFIG_USER_ONLY which are special.
+# Create list of config switches that should be poisoned in common code,
+# but filter out several which are handled manually.
 exec sed -n \
   -e' /CONFIG_TCG/d' \
   -e '/CONFIG_USER_ONLY/d' \
+  -e '/CONFIG_SOFTMMU/d' \
   -e '/^#define / {' \
   -e    's///' \
   -e    's/ .*//' \
-- 
2.34.1

Create two static libraries for use by each execution mode.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/meson.build | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/tcg/meson.build b/tcg/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/tcg/meson.build
+++ b/tcg/meson.build
@@ -XXX,XX +XXX,XX @@
+if not get_option('tcg').allowed()
+   subdir_done()
+endif
+
 tcg_ss = ss.source_set()
 
 tcg_ss.add(files(
@@ -XXX,XX +XXX,XX @@ tcg_ss.add(files(
 if get_option('tcg_interpreter')
   libffi = dependency('libffi', version: '>=3.0', required: true,
                       method: 'pkg-config')
-  specific_ss.add(libffi)
-  specific_ss.add(files('tci.c'))
+  tcg_ss.add(libffi)
+  tcg_ss.add(files('tci.c'))
 endif
 
-specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
+tcg_ss = tcg_ss.apply(config_host, strict: false)
+
+libtcg_user = static_library('tcg_user',
+                             tcg_ss.sources() + genh,
+                             name_suffix: 'fa',
+                             c_args: '-DCONFIG_USER_ONLY',
+                             build_by_default: have_user)
+
+tcg_user = declare_dependency(link_with: libtcg_user,
+                              dependencies: tcg_ss.dependencies())
+user_ss.add(tcg_user)
+
+libtcg_softmmu = static_library('tcg_softmmu',
+                                tcg_ss.sources() + genh,
+                                name_suffix: 'fa',
+                                c_args: '-DCONFIG_SOFTMMU',
+                                build_by_default: have_system)
+
+tcg_softmmu = declare_dependency(link_with: libtcg_softmmu,
+                                 dependencies: tcg_ss.dependencies())
+softmmu_ss.add(tcg_softmmu)
-- 
2.34.1

From: Ilya Leoshkevich <iii@linux.ibm.com>

Coverity complains that perf_marker is never unmapped.
Fix by unmapping it in perf_exit().

Fixes: Coverity CID 1507929
Fixes: 5584e2dbe8c9 ("tcg: add perfmap and jitdump")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Message-Id: <20230605114134.1169974-1-iii@linux.ibm.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/perf.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/accel/tcg/perf.c b/accel/tcg/perf.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/perf.c
+++ b/accel/tcg/perf.c
@@ -XXX,XX +XXX,XX @@ static void write_perfmap_entry(const void *start, size_t insn,
 }
 
 static FILE *jitdump;
+static size_t perf_marker_size;
+static void *perf_marker = MAP_FAILED;
 
 #define JITHEADER_MAGIC 0x4A695444
 #define JITHEADER_VERSION 1
@@ -XXX,XX +XXX,XX @@ void perf_enable_jitdump(void)
 {
     struct jitheader header;
     char jitdump_file[32];
-    void *perf_marker;
 
     if (!use_rt_clock) {
         warn_report("CLOCK_MONOTONIC is not available, proceeding without jitdump");
@@ -XXX,XX +XXX,XX @@ void perf_enable_jitdump(void)
      * PERF_RECORD_MMAP or PERF_RECORD_MMAP2 event is of the form jit-%d.dump
      * and will process it as a jitdump file.
      */
-    perf_marker = mmap(NULL, qemu_real_host_page_size(), PROT_READ | PROT_EXEC,
+    perf_marker_size = qemu_real_host_page_size();
+    perf_marker = mmap(NULL, perf_marker_size, PROT_READ | PROT_EXEC,
                        MAP_PRIVATE, fileno(jitdump), 0);
     if (perf_marker == MAP_FAILED) {
         warn_report("Could not map %s: %s, proceeding without jitdump",
@@ -XXX,XX +XXX,XX @@ void perf_exit(void)
         perfmap = NULL;
     }
 
+    if (perf_marker != MAP_FAILED) {
+        munmap(perf_marker, perf_marker_size);
+        perf_marker = MAP_FAILED;
+    }
+
     if (jitdump) {
         fclose(jitdump);
         jitdump = NULL;
-- 
2.34.1

From: Philippe Mathieu-Daudé <philmd@linaro.org>

In commit d56fea79f9 ("tcg: Move TCG_{LOW,HIGH} to tcg-internal.h")
we replaced the "_link_error" definitions with modern QEMU_ERROR()
attribute markup. We covered tcg-op.c but forgot to completely
clean tcg-op-vec.c. Do it now.

Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-Id: <20230605175647.88395-3-philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-vec.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@
 #include "tcg/tcg-mo.h"
 #include "tcg-internal.h"
 
-
-/* Reduce the number of ifdefs below.  This assumes that all uses of
-   TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
-   the compiler can eliminate.  */
-#if TCG_TARGET_REG_BITS == 64
-extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64);
-extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
-#define TCGV_LOW  TCGV_LOW_link_error
-#define TCGV_HIGH TCGV_HIGH_link_error
-#endif
-
 /*
  * Vector optional opcode tracking.
  * Except for the basic logical operations (and, or, xor), and
-- 
2.34.1

The following changes since commit aa3a285b5bc56a4208b3b57d4a55291e9c260107:

Merge tag 'mem-2024-12-21' of https://github.com/davidhildenbrand/qemu into staging (2024-12-22 14:33:27 -0500)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20241224

for you to fetch changes up to e4a8e093dc74be049f4829831dce76e5edab0003:

accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core (2024-12-24 08:32:15 -0800)

----------------------------------------------------------------
tcg/optimize: Remove in-flight mask data from OptContext
fpu: Add float*_muladd_scalbn
fpu: Remove float_muladd_halve_result
fpu: Add float_round_nearest_even_max
fpu: Add float_muladd_suppress_add_product_zero
target/hexagon: Use float32_muladd
accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core

----------------------------------------------------------------
Ilya Leoshkevich (1):
      tests/tcg: Do not use inttypes.h in multiarch/system/memory.c

Pierrick Bouvier (1):
      plugins: optimize cpu_index code generation

Richard Henderson (70):
      tcg/optimize: Split out finish_bb, finish_ebb
      tcg/optimize: Split out fold_affected_mask
      tcg/optimize: Copy mask writeback to fold_masks
      tcg/optimize: Split out fold_masks_zs
      tcg/optimize: Augment s_mask from z_mask in fold_masks_zs
      tcg/optimize: Change representation of s_mask
      tcg/optimize: Use finish_folding in fold_add, fold_add_vec, fold_addsub2
      tcg/optimize: Introduce const value accessors for TempOptInfo
      tcg/optimize: Use fold_masks_zs in fold_and
      tcg/optimize: Use fold_masks_zs in fold_andc
      tcg/optimize: Use fold_masks_zs in fold_bswap
      tcg/optimize: Use fold_masks_zs in fold_count_zeros
      tcg/optimize: Use fold_masks_z in fold_ctpop
      tcg/optimize: Use fold_and and fold_masks_z in fold_deposit
      tcg/optimize: Compute sign mask in fold_deposit
      tcg/optimize: Use finish_folding in fold_divide
      tcg/optimize: Use finish_folding in fold_dup, fold_dup2
      tcg/optimize: Use fold_masks_s in fold_eqv
      tcg/optimize: Use fold_masks_z in fold_extract
      tcg/optimize: Use finish_folding in fold_extract2
      tcg/optimize: Use fold_masks_zs in fold_exts
      tcg/optimize: Use fold_masks_z in fold_extu
      tcg/optimize: Use fold_masks_zs in fold_movcond
      tcg/optimize: Use finish_folding in fold_mul*
      tcg/optimize: Use fold_masks_s in fold_nand
      tcg/optimize: Use fold_masks_z in fold_neg_no_const
      tcg/optimize: Use fold_masks_s in fold_nor
      tcg/optimize: Use fold_masks_s in fold_not
      tcg/optimize: Use fold_masks_zs in fold_or
      tcg/optimize: Use fold_masks_zs in fold_orc
      tcg/optimize: Use fold_masks_zs in fold_qemu_ld
      tcg/optimize: Return true from fold_qemu_st, fold_tcg_st
      tcg/optimize: Use finish_folding in fold_remainder
      tcg/optimize: Distinguish simplification in fold_setcond_zmask
      tcg/optimize: Use fold_masks_z in fold_setcond
      tcg/optimize: Use fold_masks_s in fold_negsetcond
      tcg/optimize: Use fold_masks_z in fold_setcond2
      tcg/optimize: Use finish_folding in fold_cmp_vec
      tcg/optimize: Use finish_folding in fold_cmpsel_vec
      tcg/optimize: Use fold_masks_zs in fold_sextract
      tcg/optimize: Use fold_masks_zs, fold_masks_s in fold_shift
      tcg/optimize: Simplify sign bit test in fold_shift
      tcg/optimize: Use finish_folding in fold_sub, fold_sub_vec
      tcg/optimize: Use fold_masks_zs in fold_tcg_ld
      tcg/optimize: Use finish_folding in fold_tcg_ld_memcopy
      tcg/optimize: Use fold_masks_zs in fold_xor
      tcg/optimize: Use finish_folding in fold_bitsel_vec
      tcg/optimize: Use finish_folding as default in tcg_optimize
      tcg/optimize: Remove z_mask, s_mask from OptContext
      tcg/optimize: Re-enable sign-mask optimizations
      tcg/optimize: Move fold_bitsel_vec into alphabetic sort
      tcg/optimize: Move fold_cmp_vec, fold_cmpsel_vec into alphabetic sort
      softfloat: Add float{16,32,64}_muladd_scalbn
      target/arm: Use float*_muladd_scalbn
      target/sparc: Use float*_muladd_scalbn
      softfloat: Remove float_muladd_halve_result
      softfloat: Add float_round_nearest_even_max
      softfloat: Add float_muladd_suppress_add_product_zero
      target/hexagon: Use float32_mul in helper_sfmpy
      target/hexagon: Use float32_muladd for helper_sffma
      target/hexagon: Use float32_muladd for helper_sffms
      target/hexagon: Use float32_muladd_scalbn for helper_sffma_sc
      target/hexagon: Use float32_muladd for helper_sffm[as]_lib
      target/hexagon: Remove internal_fmafx
      target/hexagon: Expand GEN_XF_ROUND
      target/hexagon: Remove Float
      target/hexagon: Remove Double
      target/hexagon: Use mulu64 for int128_mul_6464
      target/hexagon: Simplify internal_mpyhh setup
      accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core

From: Ilya Leoshkevich <iii@linux.ibm.com>

make check-tcg fails on Fedora with the following error message:

alpha-linux-gnu-gcc [...] qemu/tests/tcg/multiarch/system/memory.c -o memory [...]
    qemu/tests/tcg/multiarch/system/memory.c:17:10: fatal error: inttypes.h: No such file or directory
       17 | #include <inttypes.h>
          |          ^~~~~~~~~~~~
    compilation terminated.

The reason is that Fedora has cross-compilers, but no cross-glibc
headers. Fix by hardcoding the format specifiers and dropping the
include.

An alternative fix would be to introduce a configure check for
inttypes.h. But this would make it impossible to use Fedora
cross-compilers for softmmu tests, which has worked so far.

Fixes: ecbcc9ead2f8 ("tests/tcg: add a system test to check memory instrumentation")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010085906.226249-1-iii@linux.ibm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/multiarch/system/memory.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/tcg/multiarch/system/memory.c b/tests/tcg/multiarch/system/memory.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/tcg/multiarch/system/memory.c
+++ b/tests/tcg/multiarch/system/memory.c
@@ -XXX,XX +XXX,XX @@
 
 #include <stdint.h>
 #include <stdbool.h>
-#include <inttypes.h>
 #include <minilib.h>
 
 #ifndef CHECK_UNALIGNED
@@ -XXX,XX +XXX,XX @@ int main(void)
     int i;
     bool ok = true;
 
-    ml_printf("Test data start: 0x%"PRIxPTR"\n", &test_data[0]);
-    ml_printf("Test data end: 0x%"PRIxPTR"\n", &test_data[TEST_SIZE]);
+    ml_printf("Test data start: 0x%lx\n", (unsigned long)&test_data[0]);
+    ml_printf("Test data end: 0x%lx\n", (unsigned long)&test_data[TEST_SIZE]);
 
     /* Run through the unsigned tests first */
     for (i = 0; i < ARRAY_SIZE(init_ufns) && ok; i++) {
@@ -XXX,XX +XXX,XX @@ int main(void)
         ok = do_signed_reads(true);
     }
 
-    ml_printf("Test data read: %"PRId32"\n", test_read_count);
-    ml_printf("Test data write: %"PRId32"\n", test_write_count);
+    ml_printf("Test data read: %lu\n", (unsigned long)test_read_count);
+    ml_printf("Test data write: %lu\n", (unsigned long)test_write_count);
     ml_printf("Test complete: %s\n", ok ? "PASSED" : "FAILED");
     return ok ? 0 : -1;
 }
-- 
2.43.0

From: Pierrick Bouvier <pierrick.bouvier@linaro.org>

When running with a single vcpu, we can return a constant instead of a
load when accessing cpu_index.
A side effect is that all tcg operations using it are optimized, most
notably scoreboard access.
When running a simple loop in user-mode, the speedup is around 20%.

Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-ID: <20241128213843.1023080-1-pierrick.bouvier@linaro.org>
---
 accel/tcg/plugin-gen.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@ static void gen_disable_mem_helper(void)
 
 static TCGv_i32 gen_cpu_index(void)
 {
+    /*
+     * Optimize when we run with a single vcpu. All values using cpu_index,
+     * including scoreboard index, will be optimized out.
+     * User-mode calls tb_flush when setting this flag. In system-mode, all
+     * vcpus are created before generating code.
+     */
+    if (!tcg_cflags_has(current_cpu, CF_PARALLEL)) {
+        return tcg_constant_i32(current_cpu->cpu_index);
+    }
     TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
     tcg_gen_ld_i32(cpu_index, tcg_env,
                    -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
-- 
2.43.0

Call finish_bb and finish_ebb directly from the opcode switch statement
in tcg_optimize, rather than in finish_folding based on opcode flags.
Adjust folding of conditional branches to match.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 47 +++++++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static void finish_bb(OptContext *ctx)
+{
+    /* We only optimize memory barriers across basic blocks. */
+    ctx->prev_mb = NULL;
+}
+
+static void finish_ebb(OptContext *ctx)
+{
+    finish_bb(ctx);
+    /* We only optimize across extended basic blocks. */
+    memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+    remove_mem_copy_all(ctx);
+}
+
 static void finish_folding(OptContext *ctx, TCGOp *op)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     int i, nb_oargs;
 
-    /*
-     * We only optimize extended basic blocks.  If the opcode ends a BB
-     * and is not a conditional branch, reset all temp data.
-     */
-    if (def->flags & TCG_OPF_BB_END) {
-        ctx->prev_mb = NULL;
-        if (!(def->flags & TCG_OPF_COND_BRANCH)) {
-            memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
-            remove_mem_copy_all(ctx);
-        }
-        return;
-    }
-
     nb_oargs = def->nb_oargs;
     for (i = 0; i < nb_oargs; i++) {
         TCGTemp *ts = arg_temp(op->args[i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
     if (i > 0) {
         op->opc = INDEX_op_br;
         op->args[0] = op->args[3];
+        finish_ebb(ctx);
+    } else {
+        finish_bb(ctx);
     }
-    return false;
+    return true;
 }
 
 static bool fold_brcond2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
         }
         op->opc = INDEX_op_br;
         op->args[0] = label;
-        break;
+        finish_ebb(ctx);
+        return true;
     }
-    return false;
+
+    finish_bb(ctx);
+    return true;
 }
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
+        case INDEX_op_set_label:
+        case INDEX_op_br:
+        case INDEX_op_exit_tb:
+        case INDEX_op_goto_tb:
+        case INDEX_op_goto_ptr:
+            finish_ebb(&ctx);
+            done = true;
+            break;
         default:
             break;
         }
-- 
2.43.0

There are only a few logical operations which can compute
an "affected" mask.  Split out handling of this optimization
to a separate function, only to be called when applicable.

Remove the a_mask field from OptContext, as the mask is
no longer stored anywhere.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     QSIMPLEQ_HEAD(, MemCopyInfo) mem_free;
 
     /* In flight values from optimization. */
-    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
     uint64_t s_mask;  /* mask of clrsb(value) bits */
     TCGType type;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
-    uint64_t a_mask = ctx->a_mask;
     uint64_t z_mask = ctx->z_mask;
     uint64_t s_mask = ctx->s_mask;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      * type changing opcodes.
      */
     if (ctx->type == TCG_TYPE_I32) {
-        a_mask = (int32_t)a_mask;
         z_mask = (int32_t)z_mask;
         s_mask |= MAKE_64BIT_MASK(32, 32);
         ctx->z_mask = z_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (z_mask == 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
     }
+    return false;
+}
+
+/*
+ * An "affected" mask bit is 0 if and only if the result is identical
+ * to the first input.  Thus if the entire mask is 0, the operation
+ * is equivalent to a copy.
+ */
+static bool fold_affected_mask(OptContext *ctx, TCGOp *op, uint64_t a_mask)
+{
+    if (ctx->type == TCG_TYPE_I32) {
+        a_mask = (uint32_t)a_mask;
+    }
     if (a_mask == 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
      */
-    if (arg_is_const(op->args[2])) {
-        ctx->a_mask = z1 & ~z2;
+    if (arg_is_const(op->args[2]) &&
+        fold_affected_mask(ctx, op, z1 & ~z2)) {
+        return true;
     }
 
     return fold_masks(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
      */
     if (arg_is_const(op->args[2])) {
         uint64_t z2 = ~arg_info(op->args[2])->z_mask;
-        ctx->a_mask = z1 & ~z2;
+        if (fold_affected_mask(ctx, op, z1 & ~z2)) {
+            return true;
+        }
         z1 &= z2;
     }
     ctx->z_mask = z1;
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
 
     z_mask_old = arg_info(op->args[1])->z_mask;
     z_mask = extract64(z_mask_old, pos, len);
-    if (pos == 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
+    if (pos == 0 && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
+        return true;
     }
     ctx->z_mask = z_mask;
     ctx->s_mask = smask_from_zmask(z_mask);
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = z_mask;
     ctx->s_mask = s_mask;
-    if (!type_change) {
-        ctx->a_mask = s_mask & ~s_mask_old;
+    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+        return true;
     }
 
     return fold_masks(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = z_mask;
     ctx->s_mask = smask_from_zmask(z_mask);
-    if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
+    if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
+        return true;
     }
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
     s_mask |= MAKE_64BIT_MASK(len, 64 - len);
     ctx->s_mask = s_mask;
 
-    if (pos == 0) {
-        ctx->a_mask = s_mask & ~s_mask_old;
+    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+        return true;
     }
 
     return fold_masks(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         /* Assume all bits affected, no bits known zero, no sign reps. */
-        ctx.a_mask = -1;
         ctx.z_mask = -1;
         ctx.s_mask = 0;
 
-- 
2.43.0

Use of fold_masks should be restricted to those opcodes that
can reliably make use of it -- those with a single output,
and from higher-level folders that set up the masks.
Prepare for conversion of each folder in turn.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask = ctx->z_mask;
     uint64_t s_mask = ctx->s_mask;
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    TCGTemp *ts;
+    TempOptInfo *ti;
+
+    /* Only single-output opcodes are supported here. */
+    tcg_debug_assert(def->nb_oargs == 1);
 
     /*
      * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (ctx->type == TCG_TYPE_I32) {
         z_mask = (int32_t)z_mask;
         s_mask |= MAKE_64BIT_MASK(32, 32);
-        ctx->z_mask = z_mask;
-        ctx->s_mask = s_mask;
     }
 
     if (z_mask == 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
     }
-    return false;
+
+    ts = arg_temp(op->args[0]);
+    reset_ts(ctx, ts);
+
+    ti = ts_info(ts);
+    ti->z_mask = z_mask;
+    ti->s_mask = s_mask;
+    return true;
 }
 
 /*
-- 
2.43.0

Add a routine to which masks can be passed directly, rather than
storing them into OptContext.  To be used in upcoming patches.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
-static bool fold_masks(OptContext *ctx, TCGOp *op)
+/*
+ * Record "zero" and "sign" masks for the single output of @op.
+ * See TempOptInfo definition of z_mask and s_mask.
+ * If z_mask allows, fold the output to constant zero.
+ */
+static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
+                          uint64_t z_mask, uint64_t s_mask)
 {
-    uint64_t z_mask = ctx->z_mask;
-    uint64_t s_mask = ctx->s_mask;
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGTemp *ts;
     TempOptInfo *ti;
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_masks(OptContext *ctx, TCGOp *op)
+{
+    return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
+}
+
 /*
  * An "affected" mask bit is 0 if and only if the result is identical
  * to the first input.  Thus if the entire mask is 0, the operation
-- 
2.43.0

Consider the passed s_mask to be a minimum deduced from
either existing s_mask or from a sign-extension operation.
We may be able to deduce more from the set of known zeros.
Remove identical logic from several opcode folders.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
  * Record "zero" and "sign" masks for the single output of @op.
  * See TempOptInfo definition of z_mask and s_mask.
  * If z_mask allows, fold the output to constant zero.
+ * The passed s_mask may be augmented by z_mask.
  */
 static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
                           uint64_t z_mask, uint64_t s_mask)
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
 
     ti = ts_info(ts);
     ti->z_mask = z_mask;
-    ti->s_mask = s_mask;
+    ti->s_mask = s_mask | smask_from_zmask(z_mask);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
-    s_mask = smask_from_zmask(z_mask);
 
+    s_mask = 0;
     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
     case TCG_BSWAP_OZ:
         break;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
-        s_mask = 0;
         break;
     }
     ctx->z_mask = z_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         g_assert_not_reached();
     }
     ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
-    ctx->s_mask = smask_from_zmask(ctx->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
-    ctx->s_mask = smask_from_zmask(ctx->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
         return true;
     }
     ctx->z_mask = z_mask;
-    ctx->s_mask = smask_from_zmask(z_mask);
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = z_mask;
-    ctx->s_mask = smask_from_zmask(z_mask);
     if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
     int width = 8 * memop_size(mop);
 
     if (width < 64) {
-        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
-        if (!(mop & MO_SIGN)) {
+        if (mop & MO_SIGN) {
+            ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        } else {
             ctx->z_mask = MAKE_64BIT_MASK(0, width);
-            ctx->s_mask <<= 1;
         }
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
     fold_setcond_tst_pow2(ctx, op, false);
 
     ctx->z_mask = 1;
-    ctx->s_mask = smask_from_zmask(1);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = 1;
-    ctx->s_mask = smask_from_zmask(1);
     return false;
 
  do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
         break;
     CASE_OP_32_64(ld8u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
-        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
         break;
     CASE_OP_32_64(ld16s):
         ctx->s_mask = MAKE_64BIT_MASK(16, 48);
         break;
     CASE_OP_32_64(ld16u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
-        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
         break;
     case INDEX_op_ld32s_i64:
         ctx->s_mask = MAKE_64BIT_MASK(32, 32);
         break;
     case INDEX_op_ld32u_i64:
         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
-        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
         break;
     default:
         g_assert_not_reached();
-- 
2.43.0

Change the representation from sign bit repetitions to all bits equal
to the sign bit, including the sign bit itself.

The previous format has a problem in that it is difficult to recreate
a valid sign mask after a shift operation: the "repetitions" part of
the previous format meant that applying the same shift as for the value
led to an off-by-one value.

The new format, including the sign bit itself, means that the sign mask
can be manipulated in exactly the same way as the value, and
canonicalization is easier.

Canonicalize the s_mask in fold_masks_zs, rather than requiring callers
to do so.  Treat 0 as a non-canonical but typeless input for no sign
information, which will be reset as appropriate for the data type.
We can easily fold in the data from z_mask while canonicalizing.

Temporarily disable optimizations using s_mask while each operation is
converted to use fold_masks_zs and to the new form.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 64 ++++++++++++--------------------------------------
 1 file changed, 15 insertions(+), 49 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     QSIMPLEQ_HEAD(, MemCopyInfo) mem_copy;
     uint64_t val;
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
-    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
+    uint64_t s_mask;  /* mask bit is 1 if value bit matches msb */
 } TempOptInfo;
 
 typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
 
     /* In flight values from optimization. */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
-    uint64_t s_mask;  /* mask of clrsb(value) bits */
+    uint64_t s_mask;  /* mask bit is 1 if value bit matches msb */
     TCGType type;
 } OptContext;
 
-/* Calculate the smask for a specific value. */
-static uint64_t smask_from_value(uint64_t value)
-{
-    int rep = clrsb64(value);
-    return ~(~0ull >> rep);
-}
-
-/*
- * Calculate the smask for a given set of known-zeros.
- * If there are lots of zeros on the left, we can consider the remainder
- * an unsigned field, and thus the corresponding signed field is one bit
- * larger.
- */
-static uint64_t smask_from_zmask(uint64_t zmask)
-{
-    /*
-     * Only the 0 bits are significant for zmask, thus the msb itself
-     * must be zero, else we have no sign information.
-     */
-    int rep = clz64(zmask);
-    if (rep == 0) {
-        return 0;
-    }
-    rep -= 1;
-    return ~(~0ull >> rep);
-}
-
-/*
- * Recreate a properly left-aligned smask after manipulation.
- * Some bit-shuffling, particularly shifts and rotates, may
- * retain sign bits on the left, but may scatter disconnected
- * sign bits on the right.  Retain only what remains to the left.
- */
-static uint64_t smask_from_smask(int64_t smask)
-{
-    /* Only the 1 bits are significant for smask */
-    return smask_from_zmask(~smask);
-}
-
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
-        ti->s_mask = smask_from_value(ts->val);
+        ti->s_mask = INT64_MIN >> clrsb64(ts->val);
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
          */
         if (i == 0) {
             ts_info(ts)->z_mask = ctx->z_mask;
-            ts_info(ts)->s_mask = ctx->s_mask;
         }
     }
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
  * The passed s_mask may be augmented by z_mask.
  */
 static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
-                          uint64_t z_mask, uint64_t s_mask)
+                          uint64_t z_mask, int64_t s_mask)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGTemp *ts;
     TempOptInfo *ti;
+    int rep;
 
     /* Only single-output opcodes are supported here. */
     tcg_debug_assert(def->nb_oargs == 1);
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
      */
     if (ctx->type == TCG_TYPE_I32) {
         z_mask = (int32_t)z_mask;
-        s_mask |= MAKE_64BIT_MASK(32, 32);
+        s_mask |= INT32_MIN;
     }
 
     if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
 
     ti = ts_info(ts);
     ti->z_mask = z_mask;
-    ti->s_mask = s_mask | smask_from_zmask(z_mask);
+
+    /* Canonicalize s_mask and incorporate data from z_mask. */
+    rep = clz64(~s_mask);
+    rep = MAX(rep, clz64(z_mask));
+    rep = MAX(rep - 1, 0);
+    ti->s_mask = INT64_MIN >> rep;
+
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = z_mask;
     ctx->s_mask = s_mask;
-    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+    if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
     s_mask |= MAKE_64BIT_MASK(len, 64 - len);
     ctx->s_mask = s_mask;
 
-    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+    if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
 
         s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
-        ctx->s_mask = smask_from_smask(s_mask);
 
         return fold_masks(ctx, op);
     }
-- 
2.43.0

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_ebb(OptContext *ctx)
     remove_mem_copy_all(ctx);
 }
 
-static void finish_folding(OptContext *ctx, TCGOp *op)
+static bool finish_folding(OptContext *ctx, TCGOp *op)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     int i, nb_oargs;
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
             ts_info(ts)->z_mask = ctx->z_mask;
         }
     }
+    return true;
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 /* We cannot as yet do_constant_folding with vectors. */
@@ -XXX,XX +XXX,XX @@ static bool fold_add_vec(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
         op->args[4] = arg_new_constant(ctx, bl);
         op->args[5] = arg_new_constant(ctx, bh);
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_add2(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Introduce ti_is_const, ti_const_val, ti_is_const_val.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static inline TempOptInfo *arg_info(TCGArg arg)
     return ts_info(arg_temp(arg));
 }
 
+static inline bool ti_is_const(TempOptInfo *ti)
+{
+    return ti->is_const;
+}
+
+static inline uint64_t ti_const_val(TempOptInfo *ti)
+{
+    return ti->val;
+}
+
+static inline bool ti_is_const_val(TempOptInfo *ti, uint64_t val)
+{
+    return ti_is_const(ti) && ti_const_val(ti) == val;
+}
+
 static inline bool ts_is_const(TCGTemp *ts)
 {
-    return ts_info(ts)->is_const;
+    return ti_is_const(ts_info(ts));
 }
 
 static inline bool ts_is_const_val(TCGTemp *ts, uint64_t val)
 {
-    TempOptInfo *ti = ts_info(ts);
-    return ti->is_const && ti->val == val;
+    return ti_is_const_val(ts_info(ts), val);
 }
 
 static inline bool arg_is_const(TCGArg arg)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Sink mask computation below fold_affected_mask early exit.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_add2(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z1, z2;
+    uint64_t z1, z2, z_mask, s_mask;
+    TempOptInfo *t1, *t2;
 
     if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    z1 = arg_info(op->args[1])->z_mask;
-    z2 = arg_info(op->args[2])->z_mask;
-    ctx->z_mask = z1 & z2;
-
-    /*
-     * Sign repetitions are perforce all identical, whether they are 1 or 0.
-     * Bitwise operations preserve the relative quantity of the repetitions.
-     */
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    z1 = t1->z_mask;
+    z2 = t2->z_mask;
 
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
      */
-    if (arg_is_const(op->args[2]) &&
-        fold_affected_mask(ctx, op, z1 & ~z2)) {
+    if (ti_is_const(t2) && fold_affected_mask(ctx, op, z1 & ~z2)) {
         return true;
     }
 
-    return fold_masks(ctx, op);
+    z_mask = z1 & z2;
+
+    /*
+     * Sign repetitions are perforce all identical, whether they are 1 or 0.
+     * Bitwise operations preserve the relative quantity of the repetitions.
+     */
+    s_mask = t1->s_mask & t2->s_mask;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Avoid double inversion of the value of second const operand.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z1;
+    uint64_t z_mask, s_mask;
+    TempOptInfo *t1, *t2;
 
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    z1 = arg_info(op->args[1])->z_mask;
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    z_mask = t1->z_mask;
 
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer anything from it.
      */
-    if (arg_is_const(op->args[2])) {
-        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
-        if (fold_affected_mask(ctx, op, z1 & ~z2)) {
+    if (ti_is_const(t2)) {
+        uint64_t v2 = ti_const_val(t2);
+        if (fold_affected_mask(ctx, op, z_mask & v2)) {
             return true;
         }
-        z1 &= z2;
+        z_mask &= ~v2;
     }
-    ctx->z_mask = z1;
 
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return fold_masks(ctx, op);
+    s_mask = t1->s_mask & t2->s_mask;
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Always set s_mask along the BSWAP_OS path, since the result is
being explicitly sign-extended.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask, s_mask, sign;
+    TempOptInfo *t1 = arg_info(op->args[1]);
 
-    if (arg_is_const(op->args[1])) {
-        uint64_t t = arg_info(op->args[1])->val;
-
-        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    if (ti_is_const(t1)) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0],
+                                do_constant_folding(op->opc, ctx->type,
+                                                    ti_const_val(t1),
+                                                    op->args[2]));
     }
 
-    z_mask = arg_info(op->args[1])->z_mask;
-
+    z_mask = t1->z_mask;
     switch (op->opc) {
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
         /* If the sign bit may be 1, force all the bits above to 1. */
         if (z_mask & sign) {
             z_mask |= sign;
-            s_mask = sign << 1;
         }
+        /* The value and therefore s_mask is explicitly sign-extended. */
+        s_mask = sign;
         break;
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
         break;
     }
-    ctx->z_mask = z_mask;
-    ctx->s_mask = s_mask;
 
-    return fold_masks(ctx, op);
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_call(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots. Find TempOptInfo once.
Compute s_mask from the union of the maximum count and the
op2 fallback for op1 being zero.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
 
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask;
+    uint64_t z_mask, s_mask;
+    TempOptInfo *t1 = arg_info(op->args[1]);
+    TempOptInfo *t2 = arg_info(op->args[2]);
 
-    if (arg_is_const(op->args[1])) {
-        uint64_t t = arg_info(op->args[1])->val;
+    if (ti_is_const(t1)) {
+        uint64_t t = ti_const_val(t1);
 
         if (t != 0) {
             t = do_constant_folding(op->opc, ctx->type, t, 0);
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
-    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
-    return false;
+    s_mask = ~z_mask;
+    z_mask |= t2->z_mask;
+    s_mask &= t2->s_mask;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Add fold_masks_z as a trivial wrapper around fold_masks_zs.
Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
     return true;
 }
 
+static bool fold_masks_z(OptContext *ctx, TCGOp *op, uint64_t z_mask)
+{
+    return fold_masks_zs(ctx, op, z_mask, 0);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (fold_const1(ctx, op)) {
         return true;
     }
 
     switch (ctx->type) {
     case TCG_TYPE_I32:
-        ctx->z_mask = 32 | 31;
+        z_mask = 32 | 31;
         break;
     case TCG_TYPE_I64:
-        ctx->z_mask = 64 | 63;
+        z_mask = 64 | 63;
         break;
     default:
         g_assert_not_reached();
     }
-    return false;
+    return fold_masks_z(ctx, op, z_mask);
 }
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.
When we fold to and, use fold_and.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
 {
+    TempOptInfo *t1 = arg_info(op->args[1]);
+    TempOptInfo *t2 = arg_info(op->args[2]);
+    int ofs = op->args[3];
+    int len = op->args[4];
     TCGOpcode and_opc;
+    uint64_t z_mask;
 
-    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-        uint64_t t1 = arg_info(op->args[1])->val;
-        uint64_t t2 = arg_info(op->args[2])->val;
-
-        t1 = deposit64(t1, op->args[3], op->args[4], t2);
-        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    if (ti_is_const(t1) && ti_is_const(t2)) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0],
+                                deposit64(ti_const_val(t1), ofs, len,
+                                          ti_const_val(t2)));
     }
 
     switch (ctx->type) {
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
     }
 
     /* Inserting a value into zero at offset 0. */
-    if (arg_is_const_val(op->args[1], 0) && op->args[3] == 0) {
-        uint64_t mask = MAKE_64BIT_MASK(0, op->args[4]);
+    if (ti_is_const_val(t1, 0) && ofs == 0) {
+        uint64_t mask = MAKE_64BIT_MASK(0, len);
 
         op->opc = and_opc;
         op->args[1] = op->args[2];
         op->args[2] = arg_new_constant(ctx, mask);
-        ctx->z_mask = mask & arg_info(op->args[1])->z_mask;
-        return false;
+        return fold_and(ctx, op);
     }
 
     /* Inserting zero into a value. */
-    if (arg_is_const_val(op->args[2], 0)) {
-        uint64_t mask = deposit64(-1, op->args[3], op->args[4], 0);
+    if (ti_is_const_val(t2, 0)) {
+        uint64_t mask = deposit64(-1, ofs, len, 0);
 
         op->opc = and_opc;
         op->args[2] = arg_new_constant(ctx, mask);
-        ctx->z_mask = mask & arg_info(op->args[1])->z_mask;
-        return false;
+        return fold_and(ctx, op);
     }
 
-    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
-                            op->args[3], op->args[4],
-                            arg_info(op->args[2])->z_mask);
-    return false;
+    z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask);
+    return fold_masks_z(ctx, op, z_mask);
 }
 
 static bool fold_divide(OptContext *ctx, TCGOp *op)
-- 
2.43.0

The input which overlaps the sign bit of the output can
have its input s_mask propagated to the output s_mask.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
     TempOptInfo *t2 = arg_info(op->args[2]);
     int ofs = op->args[3];
     int len = op->args[4];
+    int width;
     TCGOpcode and_opc;
-    uint64_t z_mask;
+    uint64_t z_mask, s_mask;
 
     if (ti_is_const(t1) && ti_is_const(t2)) {
         return tcg_opt_gen_movi(ctx, op, op->args[0],
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
     switch (ctx->type) {
     case TCG_TYPE_I32:
         and_opc = INDEX_op_and_i32;
+        width = 32;
         break;
     case TCG_TYPE_I64:
         and_opc = INDEX_op_and_i64;
+        width = 64;
         break;
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
         return fold_and(ctx, op);
     }
 
+    /* The s_mask from the top portion of the deposit is still valid. */
+    if (ofs + len == width) {
+        s_mask = t2->s_mask << ofs;
+    } else {
+        s_mask = t1->s_mask & ~MAKE_64BIT_MASK(0, ofs + len);
+    }
+
     z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask);
-    return fold_masks_z(ctx, op, z_mask);
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_divide(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Add fold_masks_s as a trivial wrapper around fold_masks_zs.
Avoid the use of the OptContext slots.

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_z(OptContext *ctx, TCGOp *op, uint64_t z_mask)
     return fold_masks_zs(ctx, op, z_mask, 0);
 }
 
+static bool fold_masks_s(OptContext *ctx, TCGOp *op, uint64_t s_mask)
+{
+    return fold_masks_zs(ctx, op, -1, s_mask);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask;
+
     if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
 
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return false;
+    s_mask = arg_info(op->args[1])->s_mask
+           & arg_info(op->args[2])->s_mask;
+    return fold_masks_s(ctx, op, s_mask);
 }
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask_old, z_mask;
+    TempOptInfo *t1 = arg_info(op->args[1]);
     int pos = op->args[2];
     int len = op->args[3];
 
-    if (arg_is_const(op->args[1])) {
-        uint64_t t;
-
-        t = arg_info(op->args[1])->val;
-        t = extract64(t, pos, len);
-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    if (ti_is_const(t1)) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0],
+                                extract64(ti_const_val(t1), pos, len));
     }
 
-    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask_old = t1->z_mask;
     z_mask = extract64(z_mask_old, pos, len);
     if (pos == 0 && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
         return true;
     }
-    ctx->z_mask = z_mask;
 
-    return fold_masks(ctx, op);
+    return fold_masks_z(ctx, op, z_mask);
 }
 
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Explicitly sign-extend z_mask instead of doing that manually.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    uint64_t s_mask_old, s_mask, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask;
     bool type_change = false;
+    TempOptInfo *t1;
 
     if (fold_const1(ctx, op)) {
         return true;
     }
 
-    z_mask = arg_info(op->args[1])->z_mask;
-    s_mask = arg_info(op->args[1])->s_mask;
+    t1 = arg_info(op->args[1]);
+    z_mask = t1->z_mask;
+    s_mask = t1->s_mask;
     s_mask_old = s_mask;
 
     switch (op->opc) {
     CASE_OP_32_64(ext8s):
-        sign = INT8_MIN;
-        z_mask = (uint8_t)z_mask;
+        s_mask |= INT8_MIN;
+        z_mask = (int8_t)z_mask;
         break;
     CASE_OP_32_64(ext16s):
-        sign = INT16_MIN;
-        z_mask = (uint16_t)z_mask;
+        s_mask |= INT16_MIN;
+        z_mask = (int16_t)z_mask;
         break;
     case INDEX_op_ext_i32_i64:
         type_change = true;
         QEMU_FALLTHROUGH;
     case INDEX_op_ext32s_i64:
-        sign = INT32_MIN;
-        z_mask = (uint32_t)z_mask;
+        s_mask |= INT32_MIN;
+        z_mask = (int32_t)z_mask;
         break;
     default:
         g_assert_not_reached();
     }
 
-    if (z_mask & sign) {
-        z_mask |= sign;
-    }
-    s_mask |= sign << 1;
-
-    ctx->z_mask = z_mask;
-    ctx->s_mask = s_mask;
     if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
-    return fold_masks(ctx, op);
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_extu(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, s_mask;
+    TempOptInfo *tt, *ft;
     int i;
 
     /* If true and false values are the same, eliminate the cmp. */
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
 
-    ctx->z_mask = arg_info(op->args[3])->z_mask
-                | arg_info(op->args[4])->z_mask;
-    ctx->s_mask = arg_info(op->args[3])->s_mask
-                & arg_info(op->args[4])->s_mask;
+    tt = arg_info(op->args[3]);
+    ft = arg_info(op->args[4]);
+    z_mask = tt->z_mask | ft->z_mask;
+    s_mask = tt->s_mask & ft->s_mask;
 
-    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-        uint64_t tv = arg_info(op->args[3])->val;
-        uint64_t fv = arg_info(op->args[4])->val;
+    if (ti_is_const(tt) && ti_is_const(ft)) {
+        uint64_t tv = ti_const_val(tt);
+        uint64_t fv = ti_const_val(ft);
         TCGOpcode opc, negopc = 0;
         TCGCond cond = op->args[5];
 
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
             }
         }
     }
-    return false;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_mul(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 1)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
         fold_xi_to_i(ctx, op, 0)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_multiply2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
         tcg_opt_gen_movi(ctx, op2, rh, h);
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, s_mask;
+    TempOptInfo *t1, *t2;
+
     if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
 
-    ctx->z_mask = arg_info(op->args[1])->z_mask
-                | arg_info(op->args[2])->z_mask;
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return fold_masks(ctx, op);
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    z_mask = t1->z_mask | t2->z_mask;
+    s_mask = t1->s_mask & t2->s_mask;
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask;
+
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, -1) ||
         fold_xi_to_x(ctx, op, -1) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return false;
+    s_mask = arg_info(op->args[1])->s_mask
+           & arg_info(op->args[2])->s_mask;
+    return fold_masks_s(ctx, op, s_mask);
 }
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.

Take care not to call fold_masks_zs when the memory operation
is wide enough to require multiple outputs.  To that end, split
the function in two: fold_qemu_ld_1reg and fold_qemu_ld_2reg.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
     return fold_masks_s(ctx, op, s_mask);
 }
 
-static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+static bool fold_qemu_ld_1reg(OptContext *ctx, TCGOp *op)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
+    uint64_t z_mask = -1, s_mask = 0;
 
     if (width < 64) {
         if (mop & MO_SIGN) {
-            ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+            s_mask = MAKE_64BIT_MASK(width - 1, 64 - (width - 1));
         } else {
-            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            z_mask = MAKE_64BIT_MASK(0, width);
         }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization.  */
     ctx->prev_mb = NULL;
-    return false;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
+}
+
+static bool fold_qemu_ld_2reg(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         case INDEX_op_qemu_ld_a32_i32:
         case INDEX_op_qemu_ld_a64_i32:
+            done = fold_qemu_ld_1reg(&ctx, op);
+            break;
         case INDEX_op_qemu_ld_a32_i64:
         case INDEX_op_qemu_ld_a64_i64:
+            if (TCG_TARGET_REG_BITS == 64) {
+                done = fold_qemu_ld_1reg(&ctx, op);
+                break;
+            }
+            QEMU_FALLTHROUGH;
         case INDEX_op_qemu_ld_a32_i128:
         case INDEX_op_qemu_ld_a64_i128:
-            done = fold_qemu_ld(&ctx, op);
+            done = fold_qemu_ld_2reg(&ctx, op);
             break;
         case INDEX_op_qemu_st8_a32_i32:
         case INDEX_op_qemu_st8_a64_i32:
-- 
2.43.0

Stores have no output operands, and so need no further work.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
 {
     /* Opcodes that touch guest memory stop the mb optimization.  */
     ctx->prev_mb = NULL;
-    return false;
+    return true;
 }
 
 static bool fold_remainder(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st(OptContext *ctx, TCGOp *op)
 
     if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
         remove_mem_copy_all(ctx);
-        return false;
+        return true;
     }
 
     switch (op->opc) {
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st(OptContext *ctx, TCGOp *op)
         g_assert_not_reached();
     }
     remove_mem_copy_in(ctx, ofs, ofs + lm1);
-    return false;
+    return true;
 }
 
 static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
     TCGType type;
 
     if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
-        fold_tcg_st(ctx, op);
-        return false;
+        return fold_tcg_st(ctx, op);
     }
 
     src = arg_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
     last = ofs + tcg_type_size(type) - 1;
     remove_mem_copy_in(ctx, ofs, last);
     record_mem_copy(ctx, type, src, ofs, last);
-    return false;
+    return true;
 }
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Change return from bool to int; distinguish between
complete folding, simplification, and no change.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
     return finish_folding(ctx, op);
 }
 
-static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
+/* Return 1 if finished, -1 if simplified, 0 if unchanged. */
+static int fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
 {
     uint64_t a_zmask, b_val;
     TCGCond cond;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
                 op->opc = xor_opc;
                 op->args[2] = arg_new_constant(ctx, 1);
             }
-            return false;
+            return -1;
         }
     }
-
-    return false;
+    return 0;
 }
 
 static void fold_setcond_tst_pow2(OptContext *ctx, TCGOp *op, bool neg)
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
 
-    if (fold_setcond_zmask(ctx, op, false)) {
+    i = fold_setcond_zmask(ctx, op, false);
+    if (i > 0) {
         return true;
     }
-    fold_setcond_tst_pow2(ctx, op, false);
+    if (i == 0) {
+        fold_setcond_tst_pow2(ctx, op, false);
+    }
 
     ctx->z_mask = 1;
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_movi(ctx, op, op->args[0], -i);
     }
 
-    if (fold_setcond_zmask(ctx, op, true)) {
+    i = fold_setcond_zmask(ctx, op, true);
+    if (i > 0) {
         return true;
     }
-    fold_setcond_tst_pow2(ctx, op, true);
+    if (i == 0) {
+        fold_setcond_tst_pow2(ctx, op, true);
+    }
 
     /* Value is {0,-1} so all bits are repetitions of the sign. */
     ctx->s_mask = -1;
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask, s_mask, s_mask_old;
+    TempOptInfo *t1 = arg_info(op->args[1]);
     int pos = op->args[2];
     int len = op->args[3];
 
-    if (arg_is_const(op->args[1])) {
-        uint64_t t;
-
-        t = arg_info(op->args[1])->val;
-        t = sextract64(t, pos, len);
-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    if (ti_is_const(t1)) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0],
+                                sextract64(ti_const_val(t1), pos, len));
     }
 
-    z_mask = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask, pos, len);
-    ctx->z_mask = z_mask;
-
-    s_mask_old = arg_info(op->args[1])->s_mask;
-    s_mask = sextract64(s_mask_old, pos, len);
-    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
-    ctx->s_mask = s_mask;
+    s_mask_old = t1->s_mask;
+    s_mask = s_mask_old >> pos;
+    s_mask |= -1ull << (len - 1);
 
     if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
-    return fold_masks(ctx, op);
+    z_mask = sextract64(t1->z_mask, pos, len);
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     uint64_t s_mask, z_mask, sign;
+    TempOptInfo *t1, *t2;
 
     if (fold_const2(ctx, op) ||
         fold_ix_to_i(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    s_mask = arg_info(op->args[1])->s_mask;
-    z_mask = arg_info(op->args[1])->z_mask;
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    s_mask = t1->s_mask;
+    z_mask = t1->z_mask;
 
-    if (arg_is_const(op->args[2])) {
-        int sh = arg_info(op->args[2])->val;
-
-        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
+    if (ti_is_const(t2)) {
+        int sh = ti_const_val(t2);
 
+        z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
         s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
 
-        return fold_masks(ctx, op);
+        return fold_masks_zs(ctx, op, z_mask, s_mask);
     }
 
     switch (op->opc) {
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
          * Arithmetic right shift will not reduce the number of
          * input sign repetitions.
          */
-        ctx->s_mask = s_mask;
-        break;
+        return fold_masks_s(ctx, op, s_mask);
     CASE_OP_32_64(shr):
         /*
          * If the sign bit is known zero, then logical right shift
-         * will not reduced the number of input sign repetitions.
+         * will not reduce the number of input sign repetitions.
          */
-        sign = (s_mask & -s_mask) >> 1;
+        sign = -s_mask;
         if (sign && !(z_mask & sign)) {
-            ctx->s_mask = s_mask;
+            return fold_masks_s(ctx, op, s_mask);
         }
         break;
     default:
         break;
     }
 
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Merge the two conditions, sign != 0 && !(z_mask & sign),
by testing ~z_mask & sign.  If sign == 0, the bitwise AND
will produce zero, and so the test is false.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Duplicate fold_sub_vec into fold_sub instead of calling it,
now that fold_sub_vec always returns true.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_sub_vec(OptContext *ctx, TCGOp *op)
         fold_sub_to_neg(ctx, op)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) || fold_sub_vec(ctx, op)) {
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
+        fold_sub_to_neg(ctx, op)) {
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
                    ? INDEX_op_add_i32 : INDEX_op_add_i64);
         op->args[2] = arg_new_constant(ctx, -val);
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_sub2(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2(OptContext *ctx, TCGOp *op)
 
 static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask = -1, s_mask = 0;
+
     /* We can't do any folding with a load, but we can record bits. */
     switch (op->opc) {
     CASE_OP_32_64(ld8s):
-        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        s_mask = INT8_MIN;
         break;
     CASE_OP_32_64(ld8u):
-        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        z_mask = MAKE_64BIT_MASK(0, 8);
         break;
     CASE_OP_32_64(ld16s):
-        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
+        s_mask = INT16_MIN;
         break;
     CASE_OP_32_64(ld16u):
-        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        z_mask = MAKE_64BIT_MASK(0, 16);
         break;
     case INDEX_op_ld32s_i64:
-        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
+        s_mask = INT32_MIN;
         break;
     case INDEX_op_ld32u_i64:
-        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        z_mask = MAKE_64BIT_MASK(0, 32);
         break;
     default:
         g_assert_not_reached();
     }
-    return false;
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Remove fold_masks as the function becomes unused.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_s(OptContext *ctx, TCGOp *op, uint64_t s_mask)
     return fold_masks_zs(ctx, op, -1, s_mask);
 }
 
-static bool fold_masks(OptContext *ctx, TCGOp *op)
-{
-    return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
-}
-
 /*
  * An "affected" mask bit is 0 if and only if the result is identical
  * to the first input.  Thus if the entire mask is 0, the operation
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, s_mask;
+    TempOptInfo *t1, *t2;
+
     if (fold_const2_commutative(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    ctx->z_mask = arg_info(op->args[1])->z_mask
-                | arg_info(op->args[2])->z_mask;
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return fold_masks(ctx, op);
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    z_mask = t1->z_mask | t2->z_mask;
+    s_mask = t1->s_mask & t2->s_mask;
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
-- 
2.43.0

All mask setting is now done with parameters via fold_masks_*.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 13 -------------
 1 file changed, 13 deletions(-)

All instances of s_mask have been converted to the new
representation.  We can now re-enable usage.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
         g_assert_not_reached();
     }
 
-    if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
     s_mask = s_mask_old >> pos;
     s_mask |= -1ull << (len - 1);
 
-    if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
-- 
2.43.0

The big comment just above says functions should be sorted.
Add forward declarations as needed.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 114 +++++++++++++++++++++++++------------------------
 1 file changed, 59 insertions(+), 55 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
  *   3) those that produce information about the result value.
  */
 
+static bool fold_or(OptContext *ctx, TCGOp *op);
+static bool fold_orc(OptContext *ctx, TCGOp *op);
+static bool fold_xor(OptContext *ctx, TCGOp *op);
+
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2_commutative(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
+static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
+{
+    /* If true and false values are the same, eliminate the cmp. */
+    if (args_are_copies(op->args[2], op->args[3])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
+    }
+
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+        uint64_t tv = arg_info(op->args[2])->val;
+        uint64_t fv = arg_info(op->args[3])->val;
+
+        if (tv == -1 && fv == 0) {
+            return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+        }
+        if (tv == 0 && fv == -1) {
+            if (TCG_TARGET_HAS_not_vec) {
+                op->opc = INDEX_op_not_vec;
+                return fold_not(ctx, op);
+            } else {
+                op->opc = INDEX_op_xor_vec;
+                op->args[2] = arg_new_constant(ctx, -1);
+                return fold_xor(ctx, op);
+            }
+        }
+    }
+    if (arg_is_const(op->args[2])) {
+        uint64_t tv = arg_info(op->args[2])->val;
+        if (tv == -1) {
+            op->opc = INDEX_op_or_vec;
+            op->args[2] = op->args[3];
+            return fold_or(ctx, op);
+        }
+        if (tv == 0 && TCG_TARGET_HAS_andc_vec) {
+            op->opc = INDEX_op_andc_vec;
+            op->args[2] = op->args[1];
+            op->args[1] = op->args[3];
+            return fold_andc(ctx, op);
+        }
+    }
+    if (arg_is_const(op->args[3])) {
+        uint64_t fv = arg_info(op->args[3])->val;
+        if (fv == 0) {
+            op->opc = INDEX_op_and_vec;
+            return fold_and(ctx, op);
+        }
+        if (fv == -1 && TCG_TARGET_HAS_orc_vec) {
+            op->opc = INDEX_op_orc_vec;
+            op->args[2] = op->args[1];
+            op->args[1] = op->args[3];
+            return fold_orc(ctx, op);
+        }
+    }
+    return finish_folding(ctx, op);
+}
+
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     int i = do_constant_folding_cond1(ctx, op, NO_DEST, &op->args[0],
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
     return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
-static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
-{
-    /* If true and false values are the same, eliminate the cmp. */
-    if (args_are_copies(op->args[2], op->args[3])) {
-        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
-    }
-
-    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-        uint64_t tv = arg_info(op->args[2])->val;
-        uint64_t fv = arg_info(op->args[3])->val;
-
-        if (tv == -1 && fv == 0) {
-            return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-        }
-        if (tv == 0 && fv == -1) {
-            if (TCG_TARGET_HAS_not_vec) {
-                op->opc = INDEX_op_not_vec;
-                return fold_not(ctx, op);
-            } else {
-                op->opc = INDEX_op_xor_vec;
-                op->args[2] = arg_new_constant(ctx, -1);
-                return fold_xor(ctx, op);
-            }
-        }
-    }
-    if (arg_is_const(op->args[2])) {
-        uint64_t tv = arg_info(op->args[2])->val;
-        if (tv == -1) {
-            op->opc = INDEX_op_or_vec;
-            op->args[2] = op->args[3];
-            return fold_or(ctx, op);
-        }
-        if (tv == 0 && TCG_TARGET_HAS_andc_vec) {
-            op->opc = INDEX_op_andc_vec;
-            op->args[2] = op->args[1];
-            op->args[1] = op->args[3];
-            return fold_andc(ctx, op);
-        }
-    }
-    if (arg_is_const(op->args[3])) {
-        uint64_t fv = arg_info(op->args[3])->val;
-        if (fv == 0) {
-            op->opc = INDEX_op_and_vec;
-            return fold_and(ctx, op);
-        }
-        if (fv == -1 && TCG_TARGET_HAS_orc_vec) {
-            op->opc = INDEX_op_orc_vec;
-            op->args[2] = op->args[1];
-            op->args[1] = op->args[3];
-            return fold_orc(ctx, op);
-        }
-    }
-    return finish_folding(ctx, op);
-}
-
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
-- 
2.43.0

The big comment just above says functions should be sorted.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 60 +++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_cmp_vec(OptContext *ctx, TCGOp *op)
+{
+    /* Canonicalize the comparison to put immediate second. */
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[3] = tcg_swap_cond(op->args[3]);
+    }
+    return finish_folding(ctx, op);
+}
+
+static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
+{
+    /* If true and false values are the same, eliminate the cmp. */
+    if (args_are_copies(op->args[3], op->args[4])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[3]);
+    }
+
+    /* Canonicalize the comparison to put immediate second. */
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[5] = tcg_swap_cond(op->args[5]);
+    }
+    /*
+     * Canonicalize the "false" input reg to match the destination,
+     * so that the tcg backend can implement "move if true".
+     */
+    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
+        op->args[5] = tcg_invert_cond(op->args[5]);
+    }
+    return finish_folding(ctx, op);
+}
+
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask, s_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 }
 
-static bool fold_cmp_vec(OptContext *ctx, TCGOp *op)
-{
-    /* Canonicalize the comparison to put immediate second. */
-    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
-        op->args[3] = tcg_swap_cond(op->args[3]);
-    }
-    return finish_folding(ctx, op);
-}
-
-static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
-{
-    /* If true and false values are the same, eliminate the cmp. */
-    if (args_are_copies(op->args[3], op->args[4])) {
-        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[3]);
-    }
-
-    /* Canonicalize the comparison to put immediate second. */
-    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
-        op->args[5] = tcg_swap_cond(op->args[5]);
-    }
-    /*
-     * Canonicalize the "false" input reg to match the destination,
-     * so that the tcg backend can implement "move if true".
-     */
-    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-        op->args[5] = tcg_invert_cond(op->args[5]);
-    }
-    return finish_folding(ctx, op);
-}
-
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask, s_mask, s_mask_old;
-- 
2.43.0

We currently have a flag, float_muladd_halve_result, to scale
the result by 2**-1.  Extend this to handle arbitrary scaling.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat.h   |  6 ++++
 fpu/softfloat.c           | 58 ++++++++++++++++++++++-----------------
 fpu/softfloat-parts.c.inc |  7 +++--
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -XXX,XX +XXX,XX @@ float16 float16_add(float16, float16, float_status *status);
 float16 float16_sub(float16, float16, float_status *status);
 float16 float16_mul(float16, float16, float_status *status);
 float16 float16_muladd(float16, float16, float16, int, float_status *status);
+float16 float16_muladd_scalbn(float16, float16, float16,
+                              int, int, float_status *status);
 float16 float16_div(float16, float16, float_status *status);
 float16 float16_scalbn(float16, int, float_status *status);
 float16 float16_min(float16, float16, float_status *status);
@@ -XXX,XX +XXX,XX @@ float32 float32_mul(float32, float32, float_status *status);
 float32 float32_div(float32, float32, float_status *status);
 float32 float32_rem(float32, float32, float_status *status);
 float32 float32_muladd(float32, float32, float32, int, float_status *status);
+float32 float32_muladd_scalbn(float32, float32, float32,
+                              int, int, float_status *status);
 float32 float32_sqrt(float32, float_status *status);
 float32 float32_exp2(float32, float_status *status);
 float32 float32_log2(float32, float_status *status);
@@ -XXX,XX +XXX,XX @@ float64 float64_mul(float64, float64, float_status *status);
 float64 float64_div(float64, float64, float_status *status);
 float64 float64_rem(float64, float64, float_status *status);
 float64 float64_muladd(float64, float64, float64, int, float_status *status);
+float64 float64_muladd_scalbn(float64, float64, float64,
+                              int, int, float_status *status);
 float64 float64_sqrt(float64, float_status *status);
 float64 float64_log2(float64, float_status *status);
 FloatRelation float64_compare(float64, float64, float_status *status);
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -XXX,XX +XXX,XX @@ static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
 #define parts_mul(A, B, S) \
     PARTS_GENERIC_64_128(mul, A)(A, B, S)
 
-static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
-                                    FloatParts64 *c, int flags,
-                                    float_status *s);
-static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
-                                      FloatParts128 *c, int flags,
-                                      float_status *s);
+static FloatParts64 *parts64_muladd_scalbn(FloatParts64 *a, FloatParts64 *b,
+                                           FloatParts64 *c, int scale,
+                                           int flags, float_status *s);
+static FloatParts128 *parts128_muladd_scalbn(FloatParts128 *a, FloatParts128 *b,
+                                             FloatParts128 *c, int scale,
+                                             int flags, float_status *s);
 
-#define parts_muladd(A, B, C, Z, S) \
-    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
+#define parts_muladd_scalbn(A, B, C, Z, Y, S) \
+    PARTS_GENERIC_64_128(muladd_scalbn, A)(A, B, C, Z, Y, S)
 
 static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                  float_status *s);
@@ -XXX,XX +XXX,XX @@ floatx80_mul(floatx80 a, floatx80 b, float_status *status)
  * Fused multiply-add
  */
 
-float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
-                                    int flags, float_status *status)
+float16 QEMU_FLATTEN
+float16_muladd_scalbn(float16 a, float16 b, float16 c,
+                      int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
 
     float16_unpack_canonical(&pa, a, status);
     float16_unpack_canonical(&pb, b, status);
     float16_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
     return float16_round_pack_canonical(pr, status);
 }
 
-static float32 QEMU_SOFTFLOAT_ATTR
-soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
-                float_status *status)
+float16 float16_muladd(float16 a, float16 b, float16 c,
+                       int flags, float_status *status)
+{
+    return float16_muladd_scalbn(a, b, c, 0, flags, status);
+}
+
+float32 QEMU_SOFTFLOAT_ATTR
+float32_muladd_scalbn(float32 a, float32 b, float32 c,
+                      int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
 
     float32_unpack_canonical(&pa, a, status);
     float32_unpack_canonical(&pb, b, status);
     float32_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
     return float32_round_pack_canonical(pr, status);
 }
 
-static float64 QEMU_SOFTFLOAT_ATTR
-soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
-                float_status *status)
+float64 QEMU_SOFTFLOAT_ATTR
+float64_muladd_scalbn(float64 a, float64 b, float64 c,
+                      int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
 
     float64_unpack_canonical(&pa, a, status);
     float64_unpack_canonical(&pb, b, status);
     float64_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
     return float64_round_pack_canonical(pr, status);
 }
@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     return ur.s;
 
  soft:
-    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
+    return float32_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s);
 }
 
 float64 QEMU_FLATTEN
@@ -XXX,XX +XXX,XX @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
     return ur.s;
 
  soft:
-    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
+    return float64_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s);
 }
 
 float64 float64r32_muladd(float64 a, float64 b, float64 c,
@@ -XXX,XX +XXX,XX @@ float64 float64r32_muladd(float64 a, float64 b, float64 c,
     float64_unpack_canonical(&pa, a, status);
     float64_unpack_canonical(&pb, b, status);
     float64_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
     return float64r32_round_pack_canonical(pr, status);
 }
@@ -XXX,XX +XXX,XX @@ bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
     bfloat16_unpack_canonical(&pa, a, status);
     bfloat16_unpack_canonical(&pb, b, status);
     bfloat16_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
     return bfloat16_round_pack_canonical(pr, status);
 }
@@ -XXX,XX +XXX,XX @@ float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
     float128_unpack_canonical(&pa, a, status);
     float128_unpack_canonical(&pb, b, status);
     float128_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
     return float128_round_pack_canonical(pr, status);
 }
@@ -XXX,XX +XXX,XX @@ float32 float32_exp2(float32 a, float_status *status)
 
     float64_unpack_canonical(&rp, float64_one, status);
     for (i = 0 ; i < 15 ; i++) {
+
         float64_unpack_canonical(&tp, float32_exp2_coefficients[i], status);
-        rp = *parts_muladd(&tp, &xnp, &rp, 0, status);
+        rp = *parts_muladd_scalbn(&tp, &xnp, &rp, 0, 0, status);
         xnp = *parts_mul(&xnp, &xp, status);
     }
 
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
  * Requires A and C extracted into a double-sized structure to provide the
  * extra space for the widening multiply.
  */
-static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
-                                   FloatPartsN *c, int flags, float_status *s)
+static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
+                                          FloatPartsN *c, int scale,
+                                          int flags, float_status *s)
 {
     int ab_mask, abc_mask;
     FloatPartsW p_widen, c_widen;
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
     a->exp = p_widen.exp;
 
  return_normal:
+    /* TODO: Replace all use of float_muladd_halve_result with scale. */
     if (flags & float_muladd_halve_result) {
         a->exp -= 1;
     }
+    a->exp += scale;
  finish_sign:
     if (flags & float_muladd_negate_result) {
         a->sign ^= 1;
-- 
2.43.0

Use the scalbn interface instead of float_muladd_halve_result.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/helper-a64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/helper-a64.c
+++ b/target/arm/tcg/helper-a64.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, float_status *fpst)
         (float16_is_infinity(b) && float16_is_zero(a))) {
         return float16_one_point_five;
     }
-    return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
+    return float16_muladd_scalbn(a, b, float16_three, -1, 0, fpst);
 }
 
 float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst)
@@ -XXX,XX +XXX,XX @@ float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst)
         (float32_is_infinity(b) && float32_is_zero(a))) {
         return float32_one_point_five;
     }
-    return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
+    return float32_muladd_scalbn(a, b, float32_three, -1, 0, fpst);
 }
 
 float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst)
@@ -XXX,XX +XXX,XX @@ float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst)
         (float64_is_infinity(b) && float64_is_zero(a))) {
         return float64_one_point_five;
     }
-    return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
+    return float64_muladd_scalbn(a, b, float64_three, -1, 0, fpst);
 }
 
 /* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
-- 
2.43.0

Use the scalbn interface instead of float_muladd_halve_result.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/sparc/helper.h     |  4 +-
 target/sparc/fop_helper.c |  8 ++--
 target/sparc/translate.c  | 80 +++++++++++++++++++++++----------------
 3 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/target/sparc/helper.h b/target/sparc/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/helper.h
+++ b/target/sparc/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(faddd, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_3(fsubd, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_3(fmuld, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_3(fdivd, TCG_CALL_NO_WG, f64, env, f64, f64)
-DEF_HELPER_FLAGS_5(fmaddd, TCG_CALL_NO_WG, f64, env, f64, f64, f64, i32)
+DEF_HELPER_FLAGS_6(fmaddd, TCG_CALL_NO_WG, f64, env, f64, f64, f64, s32, i32)
 DEF_HELPER_FLAGS_3(fnaddd, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_3(fnmuld, TCG_CALL_NO_WG, f64, env, f64, f64)
 
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(fadds, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fsubs, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fmuls, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fdivs, TCG_CALL_NO_WG, f32, env, f32, f32)
-DEF_HELPER_FLAGS_5(fmadds, TCG_CALL_NO_WG, f32, env, f32, f32, f32, i32)
+DEF_HELPER_FLAGS_6(fmadds, TCG_CALL_NO_WG, f32, env, f32, f32, f32, s32, i32)
 DEF_HELPER_FLAGS_3(fnadds, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fnmuls, TCG_CALL_NO_WG, f32, env, f32, f32)
 
diff --git a/target/sparc/fop_helper.c b/target/sparc/fop_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/fop_helper.c
+++ b/target/sparc/fop_helper.c
@@ -XXX,XX +XXX,XX @@ Int128 helper_fsqrtq(CPUSPARCState *env, Int128 src)
 }
 
 float32 helper_fmadds(CPUSPARCState *env, float32 s1,
-                      float32 s2, float32 s3, uint32_t op)
+                      float32 s2, float32 s3, int32_t sc, uint32_t op)
 {
-    float32 ret = float32_muladd(s1, s2, s3, op, &env->fp_status);
+    float32 ret = float32_muladd_scalbn(s1, s2, s3, sc, op, &env->fp_status);
     check_ieee_exceptions(env, GETPC());
     return ret;
 }
 
 float64 helper_fmaddd(CPUSPARCState *env, float64 s1,
-                      float64 s2, float64 s3, uint32_t op)
+                      float64 s2, float64 s3, int32_t sc, uint32_t op)
 {
-    float64 ret = float64_muladd(s1, s2, s3, op, &env->fp_status);
+    float64 ret = float64_muladd_scalbn(s1, s2, s3, sc, op, &env->fp_status);
     check_ieee_exceptions(env, GETPC());
     return ret;
 }
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_op_fabsq(TCGv_i128 dst, TCGv_i128 src)
 
 static void gen_op_fmadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
 {
-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(0));
+    TCGv_i32 z = tcg_constant_i32(0);
+    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, z);
 }
 
 static void gen_op_fmaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
 {
-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(0));
+    TCGv_i32 z = tcg_constant_i32(0);
+    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, z);
 }
 
 static void gen_op_fmsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
 {
-    int op = float_muladd_negate_c;
-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
+    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
 }
 
 static void gen_op_fmsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
 {
-    int op = float_muladd_negate_c;
-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
+    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
 }
 
 static void gen_op_fnmsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
 {
-    int op = float_muladd_negate_c | float_muladd_negate_result;
-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c |
+                                   float_muladd_negate_result);
+    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
 }
 
 static void gen_op_fnmsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
 {
-    int op = float_muladd_negate_c | float_muladd_negate_result;
-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c |
+                                   float_muladd_negate_result);
+    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
 }
 
 static void gen_op_fnmadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
 {
-    int op = float_muladd_negate_result;
-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
+    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
 }
 
 static void gen_op_fnmaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
 {
-    int op = float_muladd_negate_result;
-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
+    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
 }
 
 /* Use muladd to compute (1 * src1) + src2 / 2 with one rounding. */
 static void gen_op_fhadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
 {
-    TCGv_i32 one = tcg_constant_i32(float32_one);
-    int op = float_muladd_halve_result;
-    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i32 fone = tcg_constant_i32(float32_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(0);
+    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
 }
 
 static void gen_op_fhaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
 {
-    TCGv_i64 one = tcg_constant_i64(float64_one);
-    int op = float_muladd_halve_result;
-    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i64 fone = tcg_constant_i64(float64_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(0);
+    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
 }
 
 /* Use muladd to compute (1 * src1) - src2 / 2 with one rounding. */
 static void gen_op_fhsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
 {
-    TCGv_i32 one = tcg_constant_i32(float32_one);
-    int op = float_muladd_negate_c | float_muladd_halve_result;
-    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i32 fone = tcg_constant_i32(float32_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
+    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
 }
 
 static void gen_op_fhsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
 {
-    TCGv_i64 one = tcg_constant_i64(float64_one);
-    int op = float_muladd_negate_c | float_muladd_halve_result;
-    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i64 fone = tcg_constant_i64(float64_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
+    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
 }
 
 /* Use muladd to compute -((1 * src1) + src2 / 2) with one rounding. */
 static void gen_op_fnhadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
 {
-    TCGv_i32 one = tcg_constant_i32(float32_one);
-    int op = float_muladd_negate_result | float_muladd_halve_result;
-    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i32 fone = tcg_constant_i32(float32_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
+    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
 }
 
 static void gen_op_fnhaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
 {
-    TCGv_i64 one = tcg_constant_i64(float64_one);
-    int op = float_muladd_negate_result | float_muladd_halve_result;
-    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i64 fone = tcg_constant_i64(float64_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
+    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
 }
 
 static void gen_op_fpexception_im(DisasContext *dc, int ftt)
-- 
2.43.0

All uses have been converted to float*_muladd_scalbn.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat.h   | 3 ---
 fpu/softfloat.c           | 6 ------
 fpu/softfloat-parts.c.inc | 4 ----
 3 files changed, 13 deletions(-)

diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -XXX,XX +XXX,XX @@ bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status);
 | Using these differs from negating an input or output before calling
 | the muladd function in that this means that a NaN doesn't have its
 | sign bit inverted before it is propagated.
-| We also support halving the result before rounding, as a special
-| case to support the ARM fused-sqrt-step instruction FRSQRTS.
 *----------------------------------------------------------------------------*/
 enum {
     float_muladd_negate_c = 1,
     float_muladd_negate_product = 2,
     float_muladd_negate_result = 4,
-    float_muladd_halve_result = 8,
 };
 
 /*----------------------------------------------------------------------------
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     if (unlikely(!can_use_fpu(s))) {
         goto soft;
     }
-    if (unlikely(flags & float_muladd_halve_result)) {
-        goto soft;
-    }
 
     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
@@ -XXX,XX +XXX,XX @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
     if (unlikely(!can_use_fpu(s))) {
         goto soft;
     }
-    if (unlikely(flags & float_muladd_halve_result)) {
-        goto soft;
-    }
 
     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
     a->exp = p_widen.exp;
 
  return_normal:
-    /* TODO: Replace all use of float_muladd_halve_result with scale. */
-    if (flags & float_muladd_halve_result) {
-        a->exp -= 1;
-    }
     a->exp += scale;
  finish_sign:
     if (flags & float_muladd_negate_result) {
-- 
2.43.0

This rounding mode is used by Hexagon.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-types.h | 2 ++
 fpu/softfloat-parts.c.inc     | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat-types.h
+++ b/include/fpu/softfloat-types.h
@@ -XXX,XX +XXX,XX @@ typedef enum __attribute__((__packed__)) {
     float_round_to_odd       = 5,
     /* Not an IEEE rounding mode: round to closest odd, overflow to inf */
     float_round_to_odd_inf   = 6,
+    /* Not an IEEE rounding mode: round to nearest even, overflow to max */
+    float_round_nearest_even_max = 7,
 } FloatRoundMode;
 
 /*
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -XXX,XX +XXX,XX @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
     int exp, flags = 0;
 
     switch (s->float_rounding_mode) {
+    case float_round_nearest_even_max:
+        overflow_norm = true;
+        /* fall through */
     case float_round_nearest_even:
         if (N > 64 && frac_lsb == 0) {
             inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
-- 
2.43.0

Certain Hexagon instructions suppress changes to the result
when the product of fma() is a true zero.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat.h   | 5 +++++
 fpu/softfloat.c           | 3 +++
 fpu/softfloat-parts.c.inc | 4 +++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -XXX,XX +XXX,XX @@ bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status);
 | Using these differs from negating an input or output before calling
 | the muladd function in that this means that a NaN doesn't have its
 | sign bit inverted before it is propagated.
+|
+| With float_muladd_suppress_add_product_zero, if A or B is zero
+| such that the product is a true zero, then return C without addition.
+| This preserves the sign of C when C is +/- 0.  Used for Hexagon.
 *----------------------------------------------------------------------------*/
 enum {
     float_muladd_negate_c = 1,
     float_muladd_negate_product = 2,
     float_muladd_negate_result = 4,
+    float_muladd_suppress_add_product_zero = 8,
 };
 
 /*----------------------------------------------------------------------------
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     if (unlikely(!can_use_fpu(s))) {
         goto soft;
     }
+    if (unlikely(flags & float_muladd_suppress_add_product_zero)) {
+        goto soft;
+    }
 
     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
             goto return_normal;
         }
         if (c->cls == float_class_zero) {
-            if (a->sign != c->sign) {
+            if (flags & float_muladd_suppress_add_product_zero) {
+                a->sign = c->sign;
+            } else if (a->sign != c->sign) {
                 goto return_sub_zero;
             }
             goto return_zero;
-- 
2.43.0

There are no special cases for this instruction.
Remove internal_mpyf as unused.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/fma_emu.h   | 1 -
 target/hexagon/fma_emu.c   | 8 --------
 target/hexagon/op_helper.c | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.h
+++ b/target/hexagon/fma_emu.h
@@ -XXX,XX +XXX,XX @@ int32_t float32_getexp(float32 f32);
 float32 infinite_float32(uint8_t sign);
 float32 internal_fmafx(float32 a, float32 b, float32 c,
                        int scale, float_status *fp_status);
-float32 internal_mpyf(float32 a, float32 b, float_status *fp_status);
 float64 internal_mpyhh(float64 a, float64 b,
                        unsigned long long int accumulated,
                        float_status *fp_status);
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.c
+++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ float32 internal_fmafx(float32 a, float32 b, float32 c, int scale,
     return accum_round_float32(result, fp_status);
 }
 
-float32 internal_mpyf(float32 a, float32 b, float_status *fp_status)
-{
-    if (float32_is_zero(a) || float32_is_zero(b)) {
-        return float32_mul(a, b, fp_status);
-    }
-    return internal_fmafx(a, b, float32_zero, 0, fp_status);
-}
-
 float64 internal_mpyhh(float64 a, float64 b,
                       unsigned long long int accumulated,
                       float_status *fp_status)
diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -XXX,XX +XXX,XX @@ float32 HELPER(sfmpy)(CPUHexagonState *env, float32 RsV, float32 RtV)
 {
     float32 RdV;
     arch_fpop_start(env);
-    RdV = internal_mpyf(RsV, RtV, &env->fp_status);
+    RdV = float32_mul(RsV, RtV, &env->fp_status);
     arch_fpop_end(env);
     return RdV;
 }
-- 
2.43.0

There are no special cases for this instruction.  Since Hexagon
always uses default-NaN mode, explicitly negating the first
input is unnecessary.  Use float_muladd_negate_product instead.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

This instruction has a special case: 0 * x + c returns c
without the normal sign folding that comes with 0 + -0.
Use the new float_muladd_suppress_add_product_zero flag to
describe this.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -XXX,XX +XXX,XX @@ static float32 check_nan(float32 dst, float32 x, float_status *fp_status)
 float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV,
                          float32 RsV, float32 RtV, float32 PuV)
 {
-    size4s_t tmp;
     arch_fpop_start(env);
-    RxV = check_nan(RxV, RxV, &env->fp_status);
-    RxV = check_nan(RxV, RsV, &env->fp_status);
-    RxV = check_nan(RxV, RtV, &env->fp_status);
-    tmp = internal_fmafx(RsV, RtV, RxV, fSXTN(8, 64, PuV), &env->fp_status);
-    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
-        RxV = tmp;
-    }
+    RxV = float32_muladd_scalbn(RsV, RtV, RxV, fSXTN(8, 64, PuV),
+                                float_muladd_suppress_add_product_zero,
+                                &env->fp_status);
     arch_fpop_end(env);
     return RxV;
 }
-- 
2.43.0

There are multiple special cases for this instruction.
(1) Saturation to the normal maximum, instead of overflow to infinity,
    is handled by the new float_round_nearest_even_max rounding mode.
(2) The 0 * n + c special case is handled by the new
    float_muladd_suppress_add_product_zero flag.
(3) The Inf - Inf -> 0 special case can be detected after the fact
    by examining float_flag_invalid_isi.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 105 +++++++++----------------------------
 1 file changed, 26 insertions(+), 79 deletions(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffma)(CPUHexagonState *env, float32 RxV,
     return RxV;
 }
 
-static bool is_zero_prod(float32 a, float32 b)
-{
-    return ((float32_is_zero(a) && is_finite(b)) ||
-            (float32_is_zero(b) && is_finite(a)));
-}
-
-static float32 check_nan(float32 dst, float32 x, float_status *fp_status)
-{
-    float32 ret = dst;
-    if (float32_is_any_nan(x)) {
-        if (extract32(x, 22, 1) == 0) {
-            float_raise(float_flag_invalid, fp_status);
-        }
-        ret = make_float32(0xffffffff);    /* nan */
-    }
-    return ret;
-}
-
 float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV,
                          float32 RsV, float32 RtV, float32 PuV)
 {
@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffms)(CPUHexagonState *env, float32 RxV,
     return RxV;
 }
 
-static bool is_inf_prod(int32_t a, int32_t b)
+static float32 do_sffma_lib(CPUHexagonState *env, float32 RxV,
+                            float32 RsV, float32 RtV, int negate)
 {
-    return (float32_is_infinity(a) && float32_is_infinity(b)) ||
-           (float32_is_infinity(a) && is_finite(b) && !float32_is_zero(b)) ||
-           (float32_is_infinity(b) && is_finite(a) && !float32_is_zero(a));
+    int flags;
+
+    arch_fpop_start(env);
+
+    set_float_rounding_mode(float_round_nearest_even_max, &env->fp_status);
+    RxV = float32_muladd(RsV, RtV, RxV,
+                         negate | float_muladd_suppress_add_product_zero,
+                         &env->fp_status);
+
+    flags = get_float_exception_flags(&env->fp_status);
+    if (flags) {
+        /* Flags are suppressed by this instruction. */
+        set_float_exception_flags(0, &env->fp_status);
+
+        /* Return 0 for Inf - Inf. */
+        if (flags & float_flag_invalid_isi) {
+            RxV = 0;
+        }
+    }
+
+    arch_fpop_end(env);
+    return RxV;
 }
 
 float32 HELPER(sffma_lib)(CPUHexagonState *env, float32 RxV,
                           float32 RsV, float32 RtV)
 {
-    bool infinp;
-    bool infminusinf;
-    float32 tmp;
-
-    arch_fpop_start(env);
-    set_float_rounding_mode(float_round_nearest_even, &env->fp_status);
-    infminusinf = float32_is_infinity(RxV) &&
-                  is_inf_prod(RsV, RtV) &&
-                  (fGETBIT(31, RsV ^ RxV ^ RtV) != 0);
-    infinp = float32_is_infinity(RxV) ||
-             float32_is_infinity(RtV) ||
-             float32_is_infinity(RsV);
-    RxV = check_nan(RxV, RxV, &env->fp_status);
-    RxV = check_nan(RxV, RsV, &env->fp_status);
-    RxV = check_nan(RxV, RtV, &env->fp_status);
-    tmp = internal_fmafx(RsV, RtV, RxV, 0, &env->fp_status);
-    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
-        RxV = tmp;
-    }
-    set_float_exception_flags(0, &env->fp_status);
-    if (float32_is_infinity(RxV) && !infinp) {
-        RxV = RxV - 1;
-    }
-    if (infminusinf) {
-        RxV = 0;
-    }
-    arch_fpop_end(env);
-    return RxV;
+    return do_sffma_lib(env, RxV, RsV, RtV, 0);
 }
 
 float32 HELPER(sffms_lib)(CPUHexagonState *env, float32 RxV,
                           float32 RsV, float32 RtV)
 {
-    bool infinp;
-    bool infminusinf;
-    float32 tmp;
-
-    arch_fpop_start(env);
-    set_float_rounding_mode(float_round_nearest_even, &env->fp_status);
-    infminusinf = float32_is_infinity(RxV) &&
-                  is_inf_prod(RsV, RtV) &&
-                  (fGETBIT(31, RsV ^ RxV ^ RtV) == 0);
-    infinp = float32_is_infinity(RxV) ||
-             float32_is_infinity(RtV) ||
-             float32_is_infinity(RsV);
-    RxV = check_nan(RxV, RxV, &env->fp_status);
-    RxV = check_nan(RxV, RsV, &env->fp_status);
-    RxV = check_nan(RxV, RtV, &env->fp_status);
-    float32 minus_RsV = float32_sub(float32_zero, RsV, &env->fp_status);
-    tmp = internal_fmafx(minus_RsV, RtV, RxV, 0, &env->fp_status);
-    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
-        RxV = tmp;
-    }
-    set_float_exception_flags(0, &env->fp_status);
-    if (float32_is_infinity(RxV) && !infinp) {
-        RxV = RxV - 1;
-    }
-    if (infminusinf) {
-        RxV = 0;
-    }
-    arch_fpop_end(env);
-    return RxV;
+    return do_sffma_lib(env, RxV, RsV, RtV, float_muladd_negate_product);
 }
 
 float64 HELPER(dfmpyfix)(CPUHexagonState *env, float64 RssV, float64 RttV)
-- 
2.43.0

The function is now unused.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/fma_emu.h |   2 -
 target/hexagon/fma_emu.c | 171 ---------------------------------------
 2 files changed, 173 deletions(-)

diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.h
+++ b/target/hexagon/fma_emu.h
@@ -XXX,XX +XXX,XX @@ static inline uint32_t float32_getexp_raw(float32 f32)
 }
 int32_t float32_getexp(float32 f32);
 float32 infinite_float32(uint8_t sign);
-float32 internal_fmafx(float32 a, float32 b, float32 c,
-                       int scale, float_status *fp_status);
 float64 internal_mpyhh(float64 a, float64 b,
                        unsigned long long int accumulated,
                        float_status *fp_status);
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.c
+++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ int32_t float64_getexp(float64 f64)
     return -1;
 }
 
-static uint64_t float32_getmant(float32 f32)
-{
-    Float a = { .i = f32 };
-    if (float32_is_normal(f32)) {
-        return a.mant | 1ULL << 23;
-    }
-    if (float32_is_zero(f32)) {
-        return 0;
-    }
-    if (float32_is_denormal(f32)) {
-        return a.mant;
-    }
-    return ~0ULL;
-}
-
 int32_t float32_getexp(float32 f32)
 {
     Float a = { .i = f32 };
@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
 }
 
 /* Return a maximum finite value with the requested sign */
-static float32 maxfinite_float32(uint8_t sign)
-{
-    if (sign) {
-        return make_float32(SF_MINUS_MAXF);
-    } else {
-        return make_float32(SF_MAXF);
-    }
-}
-
-/* Return a zero value with requested sign */
-static float32 zero_float32(uint8_t sign)
-{
-    if (sign) {
-        return make_float32(0x80000000);
-    } else {
-        return float32_zero;
-    }
-}
-
 #define GEN_XF_ROUND(SUFFIX, MANTBITS, INF_EXP, INTERNAL_TYPE) \
 static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
 { \
@@ -XXX,XX +XXX,XX @@ static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
 }
 
 GEN_XF_ROUND(float64, DF_MANTBITS, DF_INF_EXP, Double)
-GEN_XF_ROUND(float32, SF_MANTBITS, SF_INF_EXP, Float)
-
-static bool is_inf_prod(float64 a, float64 b)
-{
-    return ((float64_is_infinity(a) && float64_is_infinity(b)) ||
-            (float64_is_infinity(a) && is_finite(b) && (!float64_is_zero(b))) ||
-            (float64_is_infinity(b) && is_finite(a) && (!float64_is_zero(a))));
-}
-
-static float64 special_fma(float64 a, float64 b, float64 c,
-                           float_status *fp_status)
-{
-    float64 ret = make_float64(0);
-
-    /*
-     * If A multiplied by B is an exact infinity and C is also an infinity
-     * but with the opposite sign, FMA returns NaN and raises invalid.
-     */
-    uint8_t a_sign = float64_is_neg(a);
-    uint8_t b_sign = float64_is_neg(b);
-    uint8_t c_sign = float64_is_neg(c);
-    if (is_inf_prod(a, b) && float64_is_infinity(c)) {
-        if ((a_sign ^ b_sign) != c_sign) {
-            ret = make_float64(DF_NAN);
-            float_raise(float_flag_invalid, fp_status);
-            return ret;
-        }
-    }
-    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
-        (float64_is_zero(a) && float64_is_infinity(b))) {
-        ret = make_float64(DF_NAN);
-        float_raise(float_flag_invalid, fp_status);
-        return ret;
-    }
-    /*
-     * If none of the above checks are true and C is a NaN,
-     * a NaN shall be returned
-     * If A or B are NaN, a NAN shall be returned.
-     */
-    if (float64_is_any_nan(a) ||
-        float64_is_any_nan(b) ||
-        float64_is_any_nan(c)) {
-        if (float64_is_any_nan(a) && (fGETBIT(51, a) == 0)) {
-            float_raise(float_flag_invalid, fp_status);
-        }
-        if (float64_is_any_nan(b) && (fGETBIT(51, b) == 0)) {
-            float_raise(float_flag_invalid, fp_status);
-        }
-        if (float64_is_any_nan(c) && (fGETBIT(51, c) == 0)) {
-            float_raise(float_flag_invalid, fp_status);
-        }
-        ret = make_float64(DF_NAN);
-        return ret;
-    }
-    /*
-     * We have checked for adding opposite-signed infinities.
-     * Other infinities return infinity with the correct sign
-     */
-    if (float64_is_infinity(c)) {
-        ret = infinite_float64(c_sign);
-        return ret;
-    }
-    if (float64_is_infinity(a) || float64_is_infinity(b)) {
-        ret = infinite_float64(a_sign ^ b_sign);
-        return ret;
-    }
-    g_assert_not_reached();
-}
-
-static float32 special_fmaf(float32 a, float32 b, float32 c,
-                            float_status *fp_status)
-{
-    float64 aa, bb, cc;
-    aa = float32_to_float64(a, fp_status);
-    bb = float32_to_float64(b, fp_status);
-    cc = float32_to_float64(c, fp_status);
-    return float64_to_float32(special_fma(aa, bb, cc, fp_status), fp_status);
-}
-
-float32 internal_fmafx(float32 a, float32 b, float32 c, int scale,
-                       float_status *fp_status)
-{
-    Accum prod;
-    Accum acc;
-    Accum result;
-    accum_init(&prod);
-    accum_init(&acc);
-    accum_init(&result);
-
-    uint8_t a_sign = float32_is_neg(a);
-    uint8_t b_sign = float32_is_neg(b);
-    uint8_t c_sign = float32_is_neg(c);
-    if (float32_is_infinity(a) ||
-        float32_is_infinity(b) ||
-        float32_is_infinity(c)) {
-        return special_fmaf(a, b, c, fp_status);
-    }
-    if (float32_is_any_nan(a) ||
-        float32_is_any_nan(b) ||
-        float32_is_any_nan(c)) {
-        return special_fmaf(a, b, c, fp_status);
-    }
-    if ((scale == 0) && (float32_is_zero(a) || float32_is_zero(b))) {
-        float32 tmp = float32_mul(a, b, fp_status);
-        tmp = float32_add(tmp, c, fp_status);
-        return tmp;
-    }
-
-    /* (a * 2**b) * (c * 2**d) == a*c * 2**(b+d) */
-    prod.mant = int128_mul_6464(float32_getmant(a), float32_getmant(b));
-
-    /*
-     * Note: extracting the mantissa into an int is multiplying by
-     * 2**23, so adjust here
-     */
-    prod.exp = float32_getexp(a) + float32_getexp(b) - SF_BIAS - 23;
-    prod.sign = a_sign ^ b_sign;
-    if (float32_is_zero(a) || float32_is_zero(b)) {
-        prod.exp = -2 * WAY_BIG_EXP;
-    }
-    if ((scale > 0) && float32_is_denormal(c)) {
-        acc.mant = int128_mul_6464(0, 0);
-        acc.exp = -WAY_BIG_EXP;
-        acc.sign = c_sign;
-        acc.sticky = 1;
-        result = accum_add(prod, acc);
-    } else if (!float32_is_zero(c)) {
-        acc.mant = int128_mul_6464(float32_getmant(c), 1);
-        acc.exp = float32_getexp(c);
-        acc.sign = c_sign;
-        result = accum_add(prod, acc);
-    } else {
-        result = prod;
-    }
-    result.exp += scale;
-    return accum_round_float32(result, fp_status);
-}
 
 float64 internal_mpyhh(float64 a, float64 b,
                       unsigned long long int accumulated,
-- 
2.43.0

This massive macro is now only used once.
Expand it for use only by float64.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/fma_emu.c | 255 +++++++++++++++++++--------------------
 1 file changed, 127 insertions(+), 128 deletions(-)

diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.c
+++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
 }
 
 /* Return a maximum finite value with the requested sign */
-#define GEN_XF_ROUND(SUFFIX, MANTBITS, INF_EXP, INTERNAL_TYPE) \
-static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
-{ \
-    if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0) \
-        && ((a.guard | a.round | a.sticky) == 0)) { \
-        /* result zero */ \
-        switch (fp_status->float_rounding_mode) { \
-        case float_round_down: \
-            return zero_##SUFFIX(1); \
-        default: \
-            return zero_##SUFFIX(0); \
-        } \
-    } \
-    /* Normalize right */ \
-    /* We want MANTBITS bits of mantissa plus the leading one. */ \
-    /* That means that we want MANTBITS+1 bits, or 0x000000000000FF_FFFF */ \
-    /* So we need to normalize right while the high word is non-zero and \
-    * while the low word is nonzero when masked with 0xffe0_0000_0000_0000 */ \
-    while ((int128_gethi(a.mant) != 0) || \
-           ((int128_getlo(a.mant) >> (MANTBITS + 1)) != 0)) { \
-        a = accum_norm_right(a, 1); \
-    } \
-    /* \
-     * OK, now normalize left \
-     * We want to normalize left until we have a leading one in bit 24 \
-     * Theoretically, we only need to shift a maximum of one to the left if we \
-     * shifted out lots of bits from B, or if we had no shift / 1 shift sticky \
-     * should be 0  \
-     */ \
-    while ((int128_getlo(a.mant) & (1ULL << MANTBITS)) == 0) { \
-        a = accum_norm_left(a); \
-    } \
-    /* \
-     * OK, now we might need to denormalize because of potential underflow. \
-     * We need to do this before rounding, and rounding might make us normal \
-     * again \
-     */ \
-    while (a.exp <= 0) { \
-        a = accum_norm_right(a, 1 - a.exp); \
-        /* \
-         * Do we have underflow? \
-         * That's when we get an inexact answer because we ran out of bits \
-         * in a denormal. \
-         */ \
-        if (a.guard || a.round || a.sticky) { \
-            float_raise(float_flag_underflow, fp_status); \
-        } \
-    } \
-    /* OK, we're relatively canonical... now we need to round */ \
-    if (a.guard || a.round || a.sticky) { \
-        float_raise(float_flag_inexact, fp_status); \
-        switch (fp_status->float_rounding_mode) { \
-        case float_round_to_zero: \
-            /* Chop and we're done */ \
-            break; \
-        case float_round_up: \
-            if (a.sign == 0) { \
-                a.mant = int128_add(a.mant, int128_one()); \
-            } \
-            break; \
-        case float_round_down: \
-            if (a.sign != 0) { \
-                a.mant = int128_add(a.mant, int128_one()); \
-            } \
-            break; \
-        default: \
-            if (a.round || a.sticky) { \
-                /* round up if guard is 1, down if guard is zero */ \
-                a.mant = int128_add(a.mant, int128_make64(a.guard)); \
-            } else if (a.guard) { \
-                /* exactly .5, round up if odd */ \
-                a.mant = int128_add(a.mant, int128_and(a.mant, int128_one())); \
-            } \
-            break; \
-        } \
-    } \
-    /* \
-     * OK, now we might have carried all the way up. \
-     * So we might need to shr once \
-     * at least we know that the lsb should be zero if we rounded and \
-     * got a carry out... \
-     */ \
-    if ((int128_getlo(a.mant) >> (MANTBITS + 1)) != 0) { \
-        a = accum_norm_right(a, 1); \
-    } \
-    /* Overflow? */ \
-    if (a.exp >= INF_EXP) { \
-        /* Yep, inf result */ \
-        float_raise(float_flag_overflow, fp_status); \
-        float_raise(float_flag_inexact, fp_status); \
-        switch (fp_status->float_rounding_mode) { \
-        case float_round_to_zero: \
-            return maxfinite_##SUFFIX(a.sign); \
-        case float_round_up: \
-            if (a.sign == 0) { \
-                return infinite_##SUFFIX(a.sign); \
-            } else { \
-                return maxfinite_##SUFFIX(a.sign); \
-            } \
-        case float_round_down: \
-            if (a.sign != 0) { \
-                return infinite_##SUFFIX(a.sign); \
-            } else { \
-                return maxfinite_##SUFFIX(a.sign); \
-            } \
-        default: \
-            return infinite_##SUFFIX(a.sign); \
-        } \
-    } \
-    /* Underflow? */ \
-    if (int128_getlo(a.mant) & (1ULL << MANTBITS)) { \
-        /* Leading one means: No, we're normal. So, we should be done... */ \
-        INTERNAL_TYPE ret; \
-        ret.i = 0; \
-        ret.sign = a.sign; \
-        ret.exp = a.exp; \
-        ret.mant = int128_getlo(a.mant); \
-        return ret.i; \
-    } \
-    assert(a.exp == 1); \
-    INTERNAL_TYPE ret; \
-    ret.i = 0; \
-    ret.sign = a.sign; \
-    ret.exp = 0; \
-    ret.mant = int128_getlo(a.mant); \
-    return ret.i; \
+static float64 accum_round_float64(Accum a, float_status *fp_status)
+{
+    if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0)
+        && ((a.guard | a.round | a.sticky) == 0)) {
+        /* result zero */
+        switch (fp_status->float_rounding_mode) {
+        case float_round_down:
+            return zero_float64(1);
+        default:
+            return zero_float64(0);
+        }
+    }
+    /*
+     * Normalize right
+     * We want DF_MANTBITS bits of mantissa plus the leading one.
+     * That means that we want DF_MANTBITS+1 bits, or 0x001F_FFFF_FFFF_FFFF
+     * So we need to normalize right while the high word is non-zero and
+     * while the low word is nonzero when masked with 0xffe0_0000_0000_0000
+     */
+    while ((int128_gethi(a.mant) != 0) ||
+           ((int128_getlo(a.mant) >> (DF_MANTBITS + 1)) != 0)) {
+        a = accum_norm_right(a, 1);
+    }
+    /*
+     * OK, now normalize left
+     * We want to normalize left until we have a leading one in bit DF_MANTBITS
+     * Theoretically, we only need to shift a maximum of one to the left if we
+     * shifted out lots of bits from B, or if we had no shift / 1 shift sticky
+     * should be 0
+     */
+    while ((int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) == 0) {
+        a = accum_norm_left(a);
+    }
+    /*
+     * OK, now we might need to denormalize because of potential underflow.
+     * We need to do this before rounding, and rounding might make us normal
+     * again
+     */
+    while (a.exp <= 0) {
+        a = accum_norm_right(a, 1 - a.exp);
+        /*
+         * Do we have underflow?
+         * That's when we get an inexact answer because we ran out of bits
+         * in a denormal.
+         */
+        if (a.guard || a.round || a.sticky) {
+            float_raise(float_flag_underflow, fp_status);
+        }
+    }
+    /* OK, we're relatively canonical... now we need to round */
+    if (a.guard || a.round || a.sticky) {
+        float_raise(float_flag_inexact, fp_status);
+        switch (fp_status->float_rounding_mode) {
+        case float_round_to_zero:
+            /* Chop and we're done */
+            break;
+        case float_round_up:
+            if (a.sign == 0) {
+                a.mant = int128_add(a.mant, int128_one());
+            }
+            break;
+        case float_round_down:
+            if (a.sign != 0) {
+                a.mant = int128_add(a.mant, int128_one());
+            }
+            break;
+        default:
+            if (a.round || a.sticky) {
+                /* round up if guard is 1, down if guard is zero */
+                a.mant = int128_add(a.mant, int128_make64(a.guard));
+            } else if (a.guard) {
+                /* exactly .5, round up if odd */
+                a.mant = int128_add(a.mant, int128_and(a.mant, int128_one()));
+            }
+            break;
+        }
+    }
+    /*
+     * OK, now we might have carried all the way up.
+     * So we might need to shr once
+     * at least we know that the lsb should be zero if we rounded and
+     * got a carry out...
+     */
+    if ((int128_getlo(a.mant) >> (DF_MANTBITS + 1)) != 0) {
+        a = accum_norm_right(a, 1);
+    }
+    /* Overflow? */
+    if (a.exp >= DF_INF_EXP) {
+        /* Yep, inf result */
+        float_raise(float_flag_overflow, fp_status);
+        float_raise(float_flag_inexact, fp_status);
+        switch (fp_status->float_rounding_mode) {
+        case float_round_to_zero:
+            return maxfinite_float64(a.sign);
+        case float_round_up:
+            if (a.sign == 0) {
+                return infinite_float64(a.sign);
+            } else {
+                return maxfinite_float64(a.sign);
+            }
+        case float_round_down:
+            if (a.sign != 0) {
+                return infinite_float64(a.sign);
+            } else {
+                return maxfinite_float64(a.sign);
+            }
+        default:
+            return infinite_float64(a.sign);
+        }
+    }
+    /* Underflow? */
+    if (int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) {
+        /* Leading one means: No, we're normal. So, we should be done... */
+        Double ret;
+        ret.i = 0;
+        ret.sign = a.sign;
+        ret.exp = a.exp;
+        ret.mant = int128_getlo(a.mant);
+        return ret.i;
+    }
+    assert(a.exp == 1);
+    Double ret;
+    ret.i = 0;
+    ret.sign = a.sign;
+    ret.exp = 0;
+    ret.mant = int128_getlo(a.mant);
+    return ret.i;
 }
 
-GEN_XF_ROUND(float64, DF_MANTBITS, DF_INF_EXP, Double)
-
 float64 internal_mpyhh(float64 a, float64 b,
                       unsigned long long int accumulated,
                       float_status *fp_status)
-- 
2.43.0

This structure, with bitfields, is incorrect for big-endian.
Use the existing float32_getexp_raw which uses extract32.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/fma_emu.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.c
+++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ typedef union {
     };
 } Double;
 
-typedef union {
-    float f;
-    uint32_t i;
-    struct {
-        uint32_t mant:23;
-        uint32_t exp:8;
-        uint32_t sign:1;
-    };
-} Float;
-
 static uint64_t float64_getmant(float64 f64)
 {
     Double a = { .i = f64 };
@@ -XXX,XX +XXX,XX @@ int32_t float64_getexp(float64 f64)
 
 int32_t float32_getexp(float32 f32)
 {
-    Float a = { .i = f32 };
+    int exp = float32_getexp_raw(f32);
     if (float32_is_normal(f32)) {
-        return a.exp;
+        return exp;
     }
     if (float32_is_denormal(f32)) {
-        return a.exp + 1;
+        return exp + 1;
     }
     return -1;
 }
-- 
2.43.0

This structure, with bitfields, is incorrect for big-endian.
Use extract64 and deposit64 instead.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/fma_emu.c | 46 ++++++++++++++--------------------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.c
+++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@
 
 #define WAY_BIG_EXP 4096
 
-typedef union {
-    double f;
-    uint64_t i;
-    struct {
-        uint64_t mant:52;
-        uint64_t exp:11;
-        uint64_t sign:1;
-    };
-} Double;
-
 static uint64_t float64_getmant(float64 f64)
 {
-    Double a = { .i = f64 };
+    uint64_t mant = extract64(f64, 0, 52);
     if (float64_is_normal(f64)) {
-        return a.mant | 1ULL << 52;
+        return mant | 1ULL << 52;
     }
     if (float64_is_zero(f64)) {
         return 0;
     }
     if (float64_is_denormal(f64)) {
-        return a.mant;
+        return mant;
     }
     return ~0ULL;
 }
 
 int32_t float64_getexp(float64 f64)
 {
-    Double a = { .i = f64 };
+    int exp = extract64(f64, 52, 11);
     if (float64_is_normal(f64)) {
-        return a.exp;
+        return exp;
     }
     if (float64_is_denormal(f64)) {
-        return a.exp + 1;
+        return exp + 1;
     }
     return -1;
 }
@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
 /* Return a maximum finite value with the requested sign */
 static float64 accum_round_float64(Accum a, float_status *fp_status)
 {
+    uint64_t ret;
+
     if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0)
         && ((a.guard | a.round | a.sticky) == 0)) {
         /* result zero */
@@ -XXX,XX +XXX,XX @@ static float64 accum_round_float64(Accum a, float_status *fp_status)
         }
     }
     /* Underflow? */
-    if (int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) {
+    ret = int128_getlo(a.mant);
+    if (ret & (1ULL << DF_MANTBITS)) {
         /* Leading one means: No, we're normal. So, we should be done... */
-        Double ret;
-        ret.i = 0;
-        ret.sign = a.sign;
-        ret.exp = a.exp;
-        ret.mant = int128_getlo(a.mant);
-        return ret.i;
+        ret = deposit64(ret, 52, 11, a.exp);
+    } else {
+        assert(a.exp == 1);
+        ret = deposit64(ret, 52, 11, 0);
     }
-    assert(a.exp == 1);
-    Double ret;
-    ret.i = 0;
-    ret.sign = a.sign;
-    ret.exp = 0;
-    ret.mant = int128_getlo(a.mant);
-    return ret.i;
+    ret = deposit64(ret, 63, 1, a.sign);
+    return ret;
 }
 
 float64 internal_mpyhh(float64 a, float64 b,
-- 
2.43.0

No need to open-code 64x64->128-bit multiplication.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/fma_emu.c | 32 +++-----------------------------
 1 file changed, 3 insertions(+), 29 deletions(-)

diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.c
+++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ int32_t float32_getexp(float32 f32)
     return -1;
 }
 
-static uint32_t int128_getw0(Int128 x)
-{
-    return int128_getlo(x);
-}
-
-static uint32_t int128_getw1(Int128 x)
-{
-    return int128_getlo(x) >> 32;
-}
-
 static Int128 int128_mul_6464(uint64_t ai, uint64_t bi)
 {
-    Int128 a, b;
-    uint64_t pp0, pp1a, pp1b, pp1s, pp2;
+    uint64_t l, h;
 
-    a = int128_make64(ai);
-    b = int128_make64(bi);
-    pp0 = (uint64_t)int128_getw0(a) * (uint64_t)int128_getw0(b);
-    pp1a = (uint64_t)int128_getw1(a) * (uint64_t)int128_getw0(b);
-    pp1b = (uint64_t)int128_getw1(b) * (uint64_t)int128_getw0(a);
-    pp2 = (uint64_t)int128_getw1(a) * (uint64_t)int128_getw1(b);
-
-    pp1s = pp1a + pp1b;
-    if ((pp1s < pp1a) || (pp1s < pp1b)) {
-        pp2 += (1ULL << 32);
-    }
-    uint64_t ret_low = pp0 + (pp1s << 32);
-    if ((ret_low < pp0) || (ret_low < (pp1s << 32))) {
-        pp2 += 1;
-    }
-
-    return int128_make128(ret_low, pp2 + (pp1s >> 32));
+    mulu64(&l, &h, ai, bi);
+    return int128_make128(l, h);
 }
 
 static Int128 int128_sub_borrow(Int128 a, Int128 b, int borrow)
-- 
2.43.0

Convert all targets simultaneously, as the gen_intermediate_code
function disappears from the target.  While there are possible
workarounds, they're larger than simply performing the conversion.

diff --git a/include/exec/translator.h b/include/exec/translator.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/bswap.h"
 #include "exec/vaddr.h"
 
-/**
- * gen_intermediate_code
- * @cpu: cpu context
- * @tb: translation block
- * @max_insns: max number of instructions to translate
- * @pc: guest virtual program counter address
- * @host_pc: host physical program counter address
- *
- * This function must be provided by the target, which should create
- * the target-specific DisasContext, and then invoke translator_loop.
- */
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc);
-
 /**
  * DisasJumpType:
  * @DISAS_NEXT: Next instruction in program order.
diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/tcg-cpu-ops.h
+++ b/include/hw/core/tcg-cpu-ops.h
@@ -XXX,XX +XXX,XX @@ struct TCGCPUOps {
      * Called when the first CPU is realized.
      */
     void (*initialize)(void);
+    /**
+     * @translate_code: Translate guest instructions to TCGOps
+     * @cpu: cpu context
+     * @tb: translation block
+     * @max_insns: max number of instructions to translate
+     * @pc: guest virtual program counter address
+     * @host_pc: host physical program counter address
+     *
+     * This function must be provided by the target, which should create
+     * the target-specific DisasContext, and then invoke translator_loop.
+     */
+    void (*translate_code)(CPUState *cpu, TranslationBlock *tb,
+                           int *max_insns, vaddr pc, void *host_pc);
     /**
      * @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
      *
diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/cpu.h
+++ b/target/alpha/cpu.h
@@ -XXX,XX +XXX,XX @@ enum {
 };
 
 void alpha_translate_init(void);
+void alpha_translate_code(CPUState *cs, TranslationBlock *tb,
+                          int *max_insns, vaddr pc, void *host_pc);
 
 #define CPU_RESOLVING_TYPE TYPE_ALPHA_CPU
 
diff --git a/target/arm/internals.h b/target/arm/internals.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -XXX,XX +XXX,XX @@ void init_cpreg_list(ARMCPU *cpu);
 
 void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu);
 void arm_translate_init(void);
+void arm_translate_code(CPUState *cs, TranslationBlock *tb,
+                        int *max_insns, vaddr pc, void *host_pc);
 
 void arm_cpu_register_gdb_commands(ARMCPU *cpu);
 void aarch64_cpu_register_gdb_commands(ARMCPU *cpu, GString *,
diff --git a/target/avr/cpu.h b/target/avr/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/cpu.h
+++ b/target/avr/cpu.h
@@ -XXX,XX +XXX,XX @@ static inline void set_avr_feature(CPUAVRState *env, int feature)
 }
 
 void avr_cpu_tcg_init(void);
+void avr_cpu_translate_code(CPUState *cs, TranslationBlock *tb,
+                            int *max_insns, vaddr pc, void *host_pc);
 
 int cpu_avr_exec(CPUState *cpu);
 
diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/cpu.h
+++ b/target/hexagon/cpu.h
@@ -XXX,XX +XXX,XX @@ static inline void cpu_get_tb_cpu_state(CPUHexagonState *env, vaddr *pc,
 typedef HexagonCPU ArchCPU;
 
 void hexagon_translate_init(void);
+void hexagon_translate_code(CPUState *cs, TranslationBlock *tb,
+                            int *max_insns, vaddr pc, void *host_pc);
 
 #include "exec/cpu-all.h"
 
diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -XXX,XX +XXX,XX @@ static inline int HPPA_BTLB_ENTRIES(CPUHPPAState *env)
 }
 
 void hppa_translate_init(void);
+void hppa_translate_code(CPUState *cs, TranslationBlock *tb,
+                         int *max_insns, vaddr pc, void *host_pc);
 
 #define CPU_RESOLVING_TYPE TYPE_HPPA_CPU
 
diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/helper-tcg.h
+++ b/target/i386/tcg/helper-tcg.h
@@ -XXX,XX +XXX,XX @@ static inline target_long lshift(target_long x, int n)
 
 /* translate.c */
 void tcg_x86_init(void);
+void x86_translate_code(CPUState *cs, TranslationBlock *tb,
+                        int *max_insns, vaddr pc, void *host_pc);
 
 /* excp_helper.c */
 G_NORETURN void raise_exception(CPUX86State *env, int exception_index);
diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/internals.h
+++ b/target/loongarch/internals.h
@@ -XXX,XX +XXX,XX @@
 #define TARGET_VIRT_MASK MAKE_64BIT_MASK(0, TARGET_VIRT_ADDR_SPACE_BITS)
 
 void loongarch_translate_init(void);
+void loongarch_translate_code(CPUState *cs, TranslationBlock *tb,
+                              int *max_insns, vaddr pc, void *host_pc);
 
 void G_NORETURN do_raise_exception(CPULoongArchState *env,
                                    uint32_t exception,
diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/cpu.h
+++ b/target/m68k/cpu.h
@@ -XXX,XX +XXX,XX @@ int m68k_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
 int m68k_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
 
 void m68k_tcg_init(void);
+void m68k_translate_code(CPUState *cs, TranslationBlock *tb,
+                         int *max_insns, vaddr pc, void *host_pc);
 void m68k_cpu_init_gdb(M68kCPU *cpu);
 uint32_t cpu_m68k_get_ccr(CPUM68KState *env);
 void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t);
diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.h
+++ b/target/microblaze/cpu.h
@@ -XXX,XX +XXX,XX @@ static inline void mb_cpu_write_msr(CPUMBState *env, uint32_t val)
 }
 
 void mb_tcg_init(void);
+void mb_translate_code(CPUState *cs, TranslationBlock *tb,
+                       int *max_insns, vaddr pc, void *host_pc);
 
 #define CPU_RESOLVING_TYPE TYPE_MICROBLAZE_CPU
 
diff --git a/target/mips/tcg/tcg-internal.h b/target/mips/tcg/tcg-internal.h
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/tcg-internal.h
+++ b/target/mips/tcg/tcg-internal.h
@@ -XXX,XX +XXX,XX @@
 #include "cpu.h"
 
 void mips_tcg_init(void);
+void mips_translate_code(CPUState *cs, TranslationBlock *tb,
+                         int *max_insns, vaddr pc, void *host_pc);
 
 void mips_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb);
 G_NORETURN void mips_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
diff --git a/target/openrisc/cpu.h b/target/openrisc/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/cpu.h
+++ b/target/openrisc/cpu.h
@@ -XXX,XX +XXX,XX @@ void openrisc_cpu_dump_state(CPUState *cpu, FILE *f, int flags);
 int openrisc_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
 int openrisc_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
 void openrisc_translate_init(void);
+void openrisc_translate_code(CPUState *cs, TranslationBlock *tb,
+                             int *max_insns, vaddr pc, void *host_pc);
 int print_insn_or1k(bfd_vma addr, disassemble_info *info);
 
 #ifndef CONFIG_USER_ONLY
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -XXX,XX +XXX,XX @@ extern const VMStateDescription vmstate_ppc_cpu;
 
 /*****************************************************************************/
 void ppc_translate_init(void);
+void ppc_translate_code(CPUState *cs, TranslationBlock *tb,
+                        int *max_insns, vaddr pc, void *host_pc);
 
 #if !defined(CONFIG_USER_ONLY)
 void ppc_store_sdr1(CPUPPCState *env, target_ulong value);
diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -XXX,XX +XXX,XX @@ RISCVException smstateen_acc_ok(CPURISCVState *env, int index, uint64_t bit);
 void riscv_cpu_set_mode(CPURISCVState *env, target_ulong newpriv, bool virt_en);
 
 void riscv_translate_init(void);
+void riscv_translate_code(CPUState *cs, TranslationBlock *tb,
+                          int *max_insns, vaddr pc, void *host_pc);
+
 G_NORETURN void riscv_raise_exception(CPURISCVState *env,
                                       uint32_t exception, uintptr_t pc);
 
diff --git a/target/rx/cpu.h b/target/rx/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.h
+++ b/target/rx/cpu.h
@@ -XXX,XX +XXX,XX @@ int rx_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
 int rx_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
 
 void rx_translate_init(void);
+void rx_translate_code(CPUState *cs, TranslationBlock *tb,
+                       int *max_insns, vaddr pc, void *host_pc);
 void rx_cpu_unpack_psw(CPURXState *env, uint32_t psw, int rte);
 
 #include "exec/cpu-all.h"
diff --git a/target/s390x/s390x-internal.h b/target/s390x/s390x-internal.h
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/s390x-internal.h
+++ b/target/s390x/s390x-internal.h
@@ -XXX,XX +XXX,XX @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3,
 
 /* translate.c */
 void s390x_translate_init(void);
+void s390x_translate_code(CPUState *cs, TranslationBlock *tb,
+                          int *max_insns, vaddr pc, void *host_pc);
 void s390x_restore_state_to_opc(CPUState *cs,
                                 const TranslationBlock *tb,
                                 const uint64_t *data);
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -XXX,XX +XXX,XX @@ G_NORETURN void superh_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
                                                uintptr_t retaddr);
 
 void sh4_translate_init(void);
+void sh4_translate_code(CPUState *cs, TranslationBlock *tb,
+                        int *max_insns, vaddr pc, void *host_pc);
 
 #if !defined(CONFIG_USER_ONLY)
 hwaddr superh_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr);
diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.h
+++ b/target/sparc/cpu.h
@@ -XXX,XX +XXX,XX @@ int sparc_cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
 
 /* translate.c */
 void sparc_tcg_init(void);
+void sparc_translate_code(CPUState *cs, TranslationBlock *tb,
+                          int *max_insns, vaddr pc, void *host_pc);
 
 /* fop_helper.c */
 target_ulong cpu_get_fsr(CPUSPARCState *);
diff --git a/target/tricore/cpu.h b/target/tricore/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/cpu.h
+++ b/target/tricore/cpu.h
@@ -XXX,XX +XXX,XX @@ FIELD(TB_FLAGS, PRIV, 0, 2)
 
 void cpu_state_reset(CPUTriCoreState *s);
 void tricore_tcg_init(void);
+void tricore_translate_code(CPUState *cs, TranslationBlock *tb,
+                            int *max_insns, vaddr pc, void *host_pc);
 
 static inline void cpu_get_tb_cpu_state(CPUTriCoreState *env, vaddr *pc,
                                         uint64_t *cs_base, uint32_t *flags)
diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.h
+++ b/target/xtensa/cpu.h
@@ -XXX,XX +XXX,XX @@ G_NORETURN void xtensa_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
 
 void xtensa_collect_sr_names(const XtensaConfig *config);
 void xtensa_translate_init(void);
+void xtensa_translate_code(CPUState *cs, TranslationBlock *tb,
+                           int *max_insns, vaddr pc, void *host_pc);
 void **xtensa_get_regfile_by_name(const char *name, int entries, int bits);
 void xtensa_breakpoint_handler(CPUState *cs);
 void xtensa_register_core(XtensaConfigList *node);
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ bool tcg_exec_realizefn(CPUState *cpu, Error **errp)
 
     if (!tcg_target_initialized) {
         /* Check mandatory TCGCPUOps handlers */
+        const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops;
 #ifndef CONFIG_USER_ONLY
-        assert(cpu->cc->tcg_ops->cpu_exec_halt);
-        assert(cpu->cc->tcg_ops->cpu_exec_interrupt);
+        assert(tcg_ops->cpu_exec_halt);
+        assert(tcg_ops->cpu_exec_interrupt);
 #endif /* !CONFIG_USER_ONLY */
-        cpu->cc->tcg_ops->initialize();
+        assert(tcg_ops->translate_code);
+        tcg_ops->initialize();
         tcg_target_initialized = true;
     }
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
 
     tcg_func_start(tcg_ctx);
 
-    tcg_ctx->cpu = env_cpu(env);
-    gen_intermediate_code(env_cpu(env), tb, max_insns, pc, host_pc);
+    CPUState *cs = env_cpu(env);
+    tcg_ctx->cpu = cs;
+    cs->cc->tcg_ops->translate_code(cs, tb, max_insns, pc, host_pc);
+
     assert(tb->size != 0);
     tcg_ctx->cpu = NULL;
     *max_insns = tb->icount;
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
             /*
              * Overflow of code_gen_buffer, or the current slice of it.
              *
-             * TODO: We don't need to re-do gen_intermediate_code, nor
+             * TODO: We don't need to re-do tcg_ops->translate_code, nor
              * should we re-do the tcg optimization currently hidden
              * inside tcg_gen_code.  All that should be required is to
              * flush the TBs, allocate a new TB, re-initialize it per
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps alpha_sysemu_ops = {
 
 static const TCGCPUOps alpha_tcg_ops = {
     .initialize = alpha_translate_init,
+    .translate_code = alpha_translate_code,
     .synchronize_from_tb = alpha_cpu_synchronize_from_tb,
     .restore_state_to_opc = alpha_restore_state_to_opc,
 
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps alpha_tr_ops = {
     .tb_stop            = alpha_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void alpha_translate_code(CPUState *cpu, TranslationBlock *tb,
+                          int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc;
     translator_loop(cpu, tb, max_insns, pc, host_pc, &alpha_tr_ops, &dc.base);
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps arm_sysemu_ops = {
 #ifdef CONFIG_TCG
 static const TCGCPUOps arm_tcg_ops = {
     .initialize = arm_translate_init,
+    .translate_code = arm_translate_code,
     .synchronize_from_tb = arm_cpu_synchronize_from_tb,
     .debug_excp_handler = arm_debug_excp_handler,
     .restore_state_to_opc = arm_restore_state_to_opc,
diff --git a/target/arm/tcg/cpu-v7m.c b/target/arm/tcg/cpu-v7m.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/cpu-v7m.c
+++ b/target/arm/tcg/cpu-v7m.c
@@ -XXX,XX +XXX,XX @@ static void cortex_m55_initfn(Object *obj)
 
 static const TCGCPUOps arm_v7m_tcg_ops = {
     .initialize = arm_translate_init,
+    .translate_code = arm_translate_code,
     .synchronize_from_tb = arm_cpu_synchronize_from_tb,
     .debug_excp_handler = arm_debug_excp_handler,
     .restore_state_to_opc = arm_restore_state_to_opc,
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/translate.c
+++ b/target/arm/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps thumb_translator_ops = {
     .tb_stop            = arm_tr_tb_stop,
 };
 
-/* generate intermediate code for basic block 'tb'.  */
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void arm_translate_code(CPUState *cpu, TranslationBlock *tb,
+                        int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc = { };
     const TranslatorOps *ops = &arm_translator_ops;
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/cpu.c
+++ b/target/avr/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps avr_sysemu_ops = {
 
 static const TCGCPUOps avr_tcg_ops = {
     .initialize = avr_cpu_tcg_init,
+    .translate_code = avr_cpu_translate_code,
     .synchronize_from_tb = avr_cpu_synchronize_from_tb,
     .restore_state_to_opc = avr_restore_state_to_opc,
     .cpu_exec_interrupt = avr_cpu_exec_interrupt,
diff --git a/target/avr/translate.c b/target/avr/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/translate.c
+++ b/target/avr/translate.c
@@ -XXX,XX +XXX,XX @@ static bool trans_WDR(DisasContext *ctx, arg_WDR *a)
  *
  *    - translate()
  *    - canonicalize_skip()
- *    - gen_intermediate_code()
+ *    - translate_code()
  *    - restore_state_to_opc()
  *
  */
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps avr_tr_ops = {
     .tb_stop            = avr_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void avr_cpu_translate_code(CPUState *cs, TranslationBlock *tb,
+                            int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc = { };
     translator_loop(cs, tb, max_insns, pc, host_pc, &avr_tr_ops, &dc.base);
diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/cpu.c
+++ b/target/hexagon/cpu.c
@@ -XXX,XX +XXX,XX @@ static void hexagon_cpu_init(Object *obj)
 
 static const TCGCPUOps hexagon_tcg_ops = {
     .initialize = hexagon_translate_init,
+    .translate_code = hexagon_translate_code,
     .synchronize_from_tb = hexagon_cpu_synchronize_from_tb,
     .restore_state_to_opc = hexagon_restore_state_to_opc,
 };
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps hexagon_tr_ops = {
     .tb_stop            = hexagon_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void hexagon_translate_code(CPUState *cs, TranslationBlock *tb,
+                            int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx;
 
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps hppa_sysemu_ops = {
 
 static const TCGCPUOps hppa_tcg_ops = {
     .initialize = hppa_translate_init,
+    .translate_code = hppa_translate_code,
     .synchronize_from_tb = hppa_cpu_synchronize_from_tb,
     .restore_state_to_opc = hppa_restore_state_to_opc,
 
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps hppa_tr_ops = {
 #endif
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void hppa_translate_code(CPUState *cs, TranslationBlock *tb,
+                         int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx = { };
     translator_loop(cs, tb, max_insns, pc, host_pc, &hppa_tr_ops, &ctx.base);
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static bool x86_debug_check_breakpoint(CPUState *cs)
 
 static const TCGCPUOps x86_tcg_ops = {
     .initialize = tcg_x86_init,
+    .translate_code = x86_translate_code,
     .synchronize_from_tb = x86_cpu_synchronize_from_tb,
     .restore_state_to_opc = x86_restore_state_to_opc,
     .cpu_exec_enter = x86_cpu_exec_enter,
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps i386_tr_ops = {
     .tb_stop            = i386_tr_tb_stop,
 };
 
-/* generate intermediate code for basic block 'tb'.  */
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void x86_translate_code(CPUState *cpu, TranslationBlock *tb,
+                        int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc;
 
diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/cpu.c
+++ b/target/loongarch/cpu.c
@@ -XXX,XX +XXX,XX @@ static void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags)
 
 static const TCGCPUOps loongarch_tcg_ops = {
     .initialize = loongarch_translate_init,
+    .translate_code = loongarch_translate_code,
     .synchronize_from_tb = loongarch_cpu_synchronize_from_tb,
     .restore_state_to_opc = loongarch_restore_state_to_opc,
 
diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/loongarch/tcg/translate.c
+++ b/target/loongarch/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps loongarch_tr_ops = {
     .tb_stop            = loongarch_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void loongarch_translate_code(CPUState *cs, TranslationBlock *tb,
+                              int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx;
 
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/cpu.c
+++ b/target/m68k/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps m68k_sysemu_ops = {
 
 static const TCGCPUOps m68k_tcg_ops = {
     .initialize = m68k_tcg_init,
+    .translate_code = m68k_translate_code,
     .restore_state_to_opc = m68k_restore_state_to_opc,
 
 #ifndef CONFIG_USER_ONLY
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/translate.c
+++ b/target/m68k/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps m68k_tr_ops = {
     .tb_stop            = m68k_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void m68k_translate_code(CPUState *cpu, TranslationBlock *tb,
+                         int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc;
     translator_loop(cpu, tb, max_insns, pc, host_pc, &m68k_tr_ops, &dc.base);
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps mb_sysemu_ops = {
 
 static const TCGCPUOps mb_tcg_ops = {
     .initialize = mb_tcg_init,
+    .translate_code = mb_translate_code,
     .synchronize_from_tb = mb_cpu_synchronize_from_tb,
     .restore_state_to_opc = mb_restore_state_to_opc,
 
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps mb_tr_ops = {
     .tb_stop            = mb_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void mb_translate_code(CPUState *cpu, TranslationBlock *tb,
+                       int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc;
     translator_loop(cpu, tb, max_insns, pc, host_pc, &mb_tr_ops, &dc.base);
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static const Property mips_cpu_properties[] = {
 #include "hw/core/tcg-cpu-ops.h"
 static const TCGCPUOps mips_tcg_ops = {
     .initialize = mips_tcg_init,
+    .translate_code = mips_translate_code,
     .synchronize_from_tb = mips_cpu_synchronize_from_tb,
     .restore_state_to_opc = mips_restore_state_to_opc,
 
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps mips_tr_ops = {
     .tb_stop            = mips_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void mips_translate_code(CPUState *cs, TranslationBlock *tb,
+                         int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx;
 
diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/cpu.c
+++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps openrisc_sysemu_ops = {
 
 static const TCGCPUOps openrisc_tcg_ops = {
     .initialize = openrisc_translate_init,
+    .translate_code = openrisc_translate_code,
     .synchronize_from_tb = openrisc_cpu_synchronize_from_tb,
     .restore_state_to_opc = openrisc_restore_state_to_opc,
 
diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/translate.c
+++ b/target/openrisc/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps openrisc_tr_ops = {
     .tb_stop            = openrisc_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void openrisc_translate_code(CPUState *cs, TranslationBlock *tb,
+                             int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx;
 
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps ppc_sysemu_ops = {
 
 static const TCGCPUOps ppc_tcg_ops = {
   .initialize = ppc_translate_init,
+  .translate_code = ppc_translate_code,
   .restore_state_to_opc = ppc_restore_state_to_opc,
 
 #ifdef CONFIG_USER_ONLY
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps ppc_tr_ops = {
     .tb_stop            = ppc_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void ppc_translate_code(CPUState *cs, TranslationBlock *tb,
+                        int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx;
 
diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/tcg/tcg-cpu.c
+++ b/target/riscv/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_restore_state_to_opc(CPUState *cs,
 
 static const TCGCPUOps riscv_tcg_ops = {
     .initialize = riscv_translate_init,
+    .translate_code = riscv_translate_code,
     .synchronize_from_tb = riscv_cpu_synchronize_from_tb,
     .restore_state_to_opc = riscv_restore_state_to_opc,
 
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps riscv_tr_ops = {
     .tb_stop            = riscv_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void riscv_translate_code(CPUState *cs, TranslationBlock *tb,
+                          int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx;
 
diff --git a/target/rx/cpu.c b/target/rx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.c
+++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps rx_sysemu_ops = {
 
 static const TCGCPUOps rx_tcg_ops = {
     .initialize = rx_translate_init,
+    .translate_code = rx_translate_code,
     .synchronize_from_tb = rx_cpu_synchronize_from_tb,
     .restore_state_to_opc = rx_restore_state_to_opc,
     .tlb_fill = rx_cpu_tlb_fill,
diff --git a/target/rx/translate.c b/target/rx/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/translate.c
+++ b/target/rx/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps rx_tr_ops = {
     .tb_stop            = rx_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void rx_translate_code(CPUState *cs, TranslationBlock *tb,
+                       int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc;
 
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ void cpu_get_tb_cpu_state(CPUS390XState *env, vaddr *pc,
 
 static const TCGCPUOps s390_tcg_ops = {
     .initialize = s390x_translate_init,
+    .translate_code = s390x_translate_code,
     .restore_state_to_opc = s390x_restore_state_to_opc,
 
 #ifdef CONFIG_USER_ONLY
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps s390x_tr_ops = {
     .disas_log          = s390x_tr_disas_log,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void s390x_translate_code(CPUState *cs, TranslationBlock *tb,
+                          int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc;
 
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps sh4_sysemu_ops = {
 
 static const TCGCPUOps superh_tcg_ops = {
     .initialize = sh4_translate_init,
+    .translate_code = sh4_translate_code,
     .synchronize_from_tb = superh_cpu_synchronize_from_tb,
     .restore_state_to_opc = superh_restore_state_to_opc,
 
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps sh4_tr_ops = {
     .tb_stop            = sh4_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void sh4_translate_code(CPUState *cs, TranslationBlock *tb,
+                        int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx;
 
diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps sparc_sysemu_ops = {
 
 static const TCGCPUOps sparc_tcg_ops = {
     .initialize = sparc_tcg_init,
+    .translate_code = sparc_translate_code,
     .synchronize_from_tb = sparc_cpu_synchronize_from_tb,
     .restore_state_to_opc = sparc_restore_state_to_opc,
 
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps sparc_tr_ops = {
     .tb_stop            = sparc_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void sparc_translate_code(CPUState *cs, TranslationBlock *tb,
+                          int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc = {};
 
diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/cpu.c
+++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps tricore_sysemu_ops = {
 
 static const TCGCPUOps tricore_tcg_ops = {
     .initialize = tricore_tcg_init,
+    .translate_code = tricore_translate_code,
     .synchronize_from_tb = tricore_cpu_synchronize_from_tb,
     .restore_state_to_opc = tricore_restore_state_to_opc,
     .tlb_fill = tricore_cpu_tlb_fill,
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/translate.c
+++ b/target/tricore/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps tricore_tr_ops = {
     .tb_stop            = tricore_tr_tb_stop,
 };
 
-
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void tricore_translate_code(CPUState *cs, TranslationBlock *tb,
+                            int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext ctx;
     translator_loop(cs, tb, max_insns, pc, host_pc,
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps xtensa_sysemu_ops = {
 
 static const TCGCPUOps xtensa_tcg_ops = {
     .initialize = xtensa_translate_init,
+    .translate_code = xtensa_translate_code,
     .debug_excp_handler = xtensa_breakpoint_handler,
     .restore_state_to_opc = xtensa_restore_state_to_opc,
 
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps xtensa_translator_ops = {
     .tb_stop            = xtensa_tr_tb_stop,
 };
 
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
-                           vaddr pc, void *host_pc)
+void xtensa_translate_code(CPUState *cpu, TranslationBlock *tb,
+                           int *max_insns, vaddr pc, void *host_pc)
 {
     DisasContext dc = {};
     translator_loop(cpu, tb, max_insns, pc, host_pc,
-- 
2.43.0