The following changes since commit aa3a285b5bc56a4208b3b57d4a55291e9c260107:

  Merge tag 'mem-2024-12-21' of https://github.com/davidhildenbrand/qemu into staging (2024-12-22 14:33:27 -0500)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20241224

for you to fetch changes up to e4a8e093dc74be049f4829831dce76e5edab0003:

  accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core (2024-12-24 08:32:15 -0800)

----------------------------------------------------------------
tcg/optimize: Remove in-flight mask data from OptContext
fpu: Add float*_muladd_scalbn
fpu: Remove float_muladd_halve_result
fpu: Add float_round_nearest_even_max
fpu: Add float_muladd_suppress_add_product_zero
target/hexagon: Use float32_muladd
accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core

----------------------------------------------------------------
Ilya Leoshkevich (1):
      tests/tcg: Do not use inttypes.h in multiarch/system/memory.c

Pierrick Bouvier (1):
      plugins: optimize cpu_index code generation

Richard Henderson (70):
      tcg/optimize: Split out finish_bb, finish_ebb
      tcg/optimize: Split out fold_affected_mask
      tcg/optimize: Copy mask writeback to fold_masks
      tcg/optimize: Split out fold_masks_zs
      tcg/optimize: Augment s_mask from z_mask in fold_masks_zs
      tcg/optimize: Change representation of s_mask
      tcg/optimize: Use finish_folding in fold_add, fold_add_vec, fold_addsub2
      tcg/optimize: Introduce const value accessors for TempOptInfo
      tcg/optimize: Use fold_masks_zs in fold_and
      tcg/optimize: Use fold_masks_zs in fold_andc
      tcg/optimize: Use fold_masks_zs in fold_bswap
      tcg/optimize: Use fold_masks_zs in fold_count_zeros
      tcg/optimize: Use fold_masks_z in fold_ctpop
      tcg/optimize: Use fold_and and fold_masks_z in fold_deposit
      tcg/optimize: Compute sign mask in fold_deposit
      tcg/optimize: Use finish_folding in fold_divide
      tcg/optimize: Use finish_folding in fold_dup, fold_dup2
      tcg/optimize: Use fold_masks_s in fold_eqv
      tcg/optimize: Use fold_masks_z in fold_extract
      tcg/optimize: Use finish_folding in fold_extract2
      tcg/optimize: Use fold_masks_zs in fold_exts
      tcg/optimize: Use fold_masks_z in fold_extu
      tcg/optimize: Use fold_masks_zs in fold_movcond
      tcg/optimize: Use finish_folding in fold_mul*
      tcg/optimize: Use fold_masks_s in fold_nand
      tcg/optimize: Use fold_masks_z in fold_neg_no_const
      tcg/optimize: Use fold_masks_s in fold_nor
      tcg/optimize: Use fold_masks_s in fold_not
      tcg/optimize: Use fold_masks_zs in fold_or
      tcg/optimize: Use fold_masks_zs in fold_orc
      tcg/optimize: Use fold_masks_zs in fold_qemu_ld
      tcg/optimize: Return true from fold_qemu_st, fold_tcg_st
      tcg/optimize: Use finish_folding in fold_remainder
      tcg/optimize: Distinguish simplification in fold_setcond_zmask
      tcg/optimize: Use fold_masks_z in fold_setcond
      tcg/optimize: Use fold_masks_s in fold_negsetcond
      tcg/optimize: Use fold_masks_z in fold_setcond2
      tcg/optimize: Use finish_folding in fold_cmp_vec
      tcg/optimize: Use finish_folding in fold_cmpsel_vec
      tcg/optimize: Use fold_masks_zs in fold_sextract
      tcg/optimize: Use fold_masks_zs, fold_masks_s in fold_shift
      tcg/optimize: Simplify sign bit test in fold_shift
      tcg/optimize: Use finish_folding in fold_sub, fold_sub_vec
      tcg/optimize: Use fold_masks_zs in fold_tcg_ld
      tcg/optimize: Use finish_folding in fold_tcg_ld_memcopy
      tcg/optimize: Use fold_masks_zs in fold_xor
      tcg/optimize: Use finish_folding in fold_bitsel_vec
      tcg/optimize: Use finish_folding as default in tcg_optimize
      tcg/optimize: Remove z_mask, s_mask from OptContext
      tcg/optimize: Re-enable sign-mask optimizations
      tcg/optimize: Move fold_bitsel_vec into alphabetic sort
      tcg/optimize: Move fold_cmp_vec, fold_cmpsel_vec into alphabetic sort
      softfloat: Add float{16,32,64}_muladd_scalbn
      target/arm: Use float*_muladd_scalbn
      target/sparc: Use float*_muladd_scalbn
      softfloat: Remove float_muladd_halve_result
      softfloat: Add float_round_nearest_even_max
      softfloat: Add float_muladd_suppress_add_product_zero
      target/hexagon: Use float32_mul in helper_sfmpy
      target/hexagon: Use float32_muladd for helper_sffma
      target/hexagon: Use float32_muladd for helper_sffms
      target/hexagon: Use float32_muladd_scalbn for helper_sffma_sc
      target/hexagon: Use float32_muladd for helper_sffm[as]_lib
      target/hexagon: Remove internal_fmafx
      target/hexagon: Expand GEN_XF_ROUND
      target/hexagon: Remove Float
      target/hexagon: Remove Double
      target/hexagon: Use mulu64 for int128_mul_6464
      target/hexagon: Simplify internal_mpyhh setup
      accel/tcg: Move gen_intermediate_code to TCGCPUOps.translate_core

 include/exec/translator.h           |  14 -
 include/fpu/softfloat-types.h       |   2 +
 include/fpu/softfloat.h             |  14 +-
 include/hw/core/tcg-cpu-ops.h       |  13 +
 target/alpha/cpu.h                  |   2 +
 target/arm/internals.h              |   2 +
 target/avr/cpu.h                    |   2 +
 target/hexagon/cpu.h                |   2 +
 target/hexagon/fma_emu.h            |   3 -
 target/hppa/cpu.h                   |   2 +
 target/i386/tcg/helper-tcg.h        |   2 +
 target/loongarch/internals.h        |   2 +
 target/m68k/cpu.h                   |   2 +
 target/microblaze/cpu.h             |   2 +
 target/mips/tcg/tcg-internal.h      |   2 +
 target/openrisc/cpu.h               |   2 +
 target/ppc/cpu.h                    |   2 +
 target/riscv/cpu.h                  |   3 +
 target/rx/cpu.h                     |   2 +
 target/s390x/s390x-internal.h       |   2 +
 target/sh4/cpu.h                    |   2 +
 target/sparc/cpu.h                  |   2 +
 target/sparc/helper.h               |   4 +-
 target/tricore/cpu.h                |   2 +
 target/xtensa/cpu.h                 |   2 +
 accel/tcg/cpu-exec.c                |   8 +-
 accel/tcg/plugin-gen.c              |   9 +
 accel/tcg/translate-all.c           |   8 +-
 fpu/softfloat.c                     |  63 +--
 target/alpha/cpu.c                  |   1 +
 target/alpha/translate.c            |   4 +-
 target/arm/cpu.c                    |   1 +
 target/arm/tcg/cpu-v7m.c            |   1 +
 target/arm/tcg/helper-a64.c         |   6 +-
 target/arm/tcg/translate.c          |   5 +-
 target/avr/cpu.c                    |   1 +
 target/avr/translate.c              |   6 +-
 target/hexagon/cpu.c                |   1 +
 target/hexagon/fma_emu.c            | 496 ++++++---------------
 target/hexagon/op_helper.c          | 125 ++----
 target/hexagon/translate.c          |   4 +-
 target/hppa/cpu.c                   |   1 +
 target/hppa/translate.c             |   4 +-
 target/i386/tcg/tcg-cpu.c           |   1 +
 target/i386/tcg/translate.c         |   5 +-
 target/loongarch/cpu.c              |   1 +
 target/loongarch/tcg/translate.c    |   4 +-
 target/m68k/cpu.c                   |   1 +
 target/m68k/translate.c             |   4 +-
 target/microblaze/cpu.c             |   1 +
 target/microblaze/translate.c       |   4 +-
 target/mips/cpu.c                   |   1 +
 target/mips/tcg/translate.c         |   4 +-
 target/openrisc/cpu.c               |   1 +
 target/openrisc/translate.c         |   4 +-
 target/ppc/cpu_init.c               |   1 +
 target/ppc/translate.c              |   4 +-
 target/riscv/tcg/tcg-cpu.c          |   1 +
 target/riscv/translate.c            |   4 +-
 target/rx/cpu.c                     |   1 +
 target/rx/translate.c               |   4 +-
 target/s390x/cpu.c                  |   1 +
 target/s390x/tcg/translate.c        |   4 +-
 target/sh4/cpu.c                    |   1 +
 target/sh4/translate.c              |   4 +-
 target/sparc/cpu.c                  |   1 +
 target/sparc/fop_helper.c           |   8 +-
 target/sparc/translate.c            |  84 ++--
 target/tricore/cpu.c                |   1 +
 target/tricore/translate.c          |   5 +-
 target/xtensa/cpu.c                 |   1 +
 target/xtensa/translate.c           |   4 +-
 tcg/optimize.c                      | 857 +++++++++++++++++++-----------------
 tests/tcg/multiarch/system/memory.c |   9 +-
 fpu/softfloat-parts.c.inc           |  16 +-
 75 files changed, 866 insertions(+), 1009 deletions(-)
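A note for readers on the new softfloat primitive named above: float{16,32,64}_muladd_scalbn computes (a * b + c) * 2^n with a single final rounding. The closest hosted-C analogue is fma() followed by ldexp(); scaling by a power of two is exact except at the overflow and underflow boundaries, which is exactly where the softfloat version, rounding once, can differ. A minimal sketch of that model (muladd_scalbn_model is an invented name for illustration, not a QEMU function):

    #include <math.h>
    #include <stdio.h>

    /*
     * Approximate model of float64_muladd_scalbn(a, b, c, n, ...):
     * one fused multiply-add, then a scale by 2^n.  The scale is
     * exact except near overflow/underflow, where real softfloat,
     * rounding only once, may give a different result.
     */
    static double muladd_scalbn_model(double a, double b, double c, int n)
    {
        return ldexp(fma(a, b, c), n);
    }

    int main(void)
    {
        /* (3 * 4 + 5) * 2^-1 == 8.5 */
        printf("%.17g\n", muladd_scalbn_model(3.0, 4.0, 5.0, -1));
        return 0;
    }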
From: Ilya Leoshkevich <iii@linux.ibm.com>

make check-tcg fails on Fedora with the following error message:

  alpha-linux-gnu-gcc [...] qemu/tests/tcg/multiarch/system/memory.c -o memory [...]
  qemu/tests/tcg/multiarch/system/memory.c:17:10: fatal error: inttypes.h: No such file or directory
     17 | #include <inttypes.h>
        |          ^~~~~~~~~~~~
  compilation terminated.

The reason is that Fedora has cross-compilers, but no cross-glibc
headers. Fix by hardcoding the format specifiers and dropping the
include.

An alternative fix would be to introduce a configure check for
inttypes.h. But this would make it impossible to use Fedora
cross-compilers for softmmu tests, which used to work so far.

Fixes: ecbcc9ead2f8 ("tests/tcg: add a system test to check memory instrumentation")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-ID: <20241010085906.226249-1-iii@linux.ibm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/multiarch/system/memory.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/tcg/multiarch/system/memory.c b/tests/tcg/multiarch/system/memory.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/tcg/multiarch/system/memory.c
+++ b/tests/tcg/multiarch/system/memory.c
@@ -XXX,XX +XXX,XX @@
 
 #include <stdint.h>
 #include <stdbool.h>
-#include <inttypes.h>
 #include <minilib.h>
 
 #ifndef CHECK_UNALIGNED
@@ -XXX,XX +XXX,XX @@ int main(void)
     int i;
     bool ok = true;
 
-    ml_printf("Test data start: 0x%"PRIxPTR"\n", &test_data[0]);
-    ml_printf("Test data end: 0x%"PRIxPTR"\n", &test_data[TEST_SIZE]);
+    ml_printf("Test data start: 0x%lx\n", (unsigned long)&test_data[0]);
+    ml_printf("Test data end: 0x%lx\n", (unsigned long)&test_data[TEST_SIZE]);
 
     /* Run through the unsigned tests first */
     for (i = 0; i < ARRAY_SIZE(init_ufns) && ok; i++) {
@@ -XXX,XX +XXX,XX @@ int main(void)
         ok = do_signed_reads(true);
     }
 
-    ml_printf("Test data read: %"PRId32"\n", test_read_count);
-    ml_printf("Test data write: %"PRId32"\n", test_write_count);
+    ml_printf("Test data read: %lu\n", (unsigned long)test_read_count);
+    ml_printf("Test data write: %lu\n", (unsigned long)test_write_count);
     ml_printf("Test complete: %s\n", ok ? "PASSED" : "FAILED");
     return ok ? 0 : -1;
 }
--
2.43.0
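For context on the idiom used above: casting to a fixed type with a standard length modifier is the usual way to print integers portably without <inttypes.h>. A stand-alone hosted-C illustration (ordinary printf rather than the test's ml_printf):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t count = 42;
        int buf[8];

        /*
         * Instead of "%"PRIu32 or "%"PRIxPTR from <inttypes.h>, cast
         * to unsigned long and use plain %lu / %lx.  unsigned long is
         * at least 32 bits everywhere, and pointer-sized on the Linux
         * targets this test builds for.
         */
        printf("count: %lu\n", (unsigned long)count);
        printf("addr:  0x%lx\n", (unsigned long)(uintptr_t)&buf[0]);
        return 0;
    }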
From: Pierrick Bouvier <pierrick.bouvier@linaro.org>

When running with a single vcpu, we can return a constant instead of a
load when accessing cpu_index.
A side effect is that all tcg operations using it are optimized, most
notably scoreboard access.
When running a simple loop in user-mode, the speedup is around 20%.

Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-ID: <20241128213843.1023080-1-pierrick.bouvier@linaro.org>
---
 accel/tcg/plugin-gen.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@ static void gen_disable_mem_helper(void)
 
 static TCGv_i32 gen_cpu_index(void)
 {
+    /*
+     * Optimize when we run with a single vcpu. All values using cpu_index,
+     * including scoreboard index, will be optimized out.
+     * User-mode calls tb_flush when setting this flag. In system-mode, all
+     * vcpus are created before generating code.
+     */
+    if (!tcg_cflags_has(current_cpu, CF_PARALLEL)) {
+        return tcg_constant_i32(current_cpu->cpu_index);
+    }
     TCGv_i32 cpu_index = tcg_temp_ebb_new_i32();
     tcg_gen_ld_i32(cpu_index, tcg_env,
                    -offsetof(ArchCPU, env) + offsetof(CPUState, cpu_index));
--
2.43.0
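The mechanism is easiest to see outside TCG: once an index is a compile-time constant, every address computed from it folds to an immediate. A rough analogy in plain C (the scoreboard array and helper names are invented for illustration; TCG performs the equivalent folding on the generated ops):

    #include <stdint.h>

    static uint64_t scoreboard[64];   /* hypothetical per-vcpu slots */

    static inline uint64_t *slot(int cpu_index)
    {
        return &scoreboard[cpu_index];
    }

    /* Runtime index: needs a load plus address arithmetic. */
    void count_insn(int cpu_index) { ++*slot(cpu_index); }

    /*
     * Known-constant index (the single-vcpu case): the compiler folds
     * the whole address to an immediate, much as the TCG optimizer
     * does for ops built on tcg_constant_i32(cpu_index).
     */
    void count_insn_cpu0(void) { ++*slot(0); }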
Call them directly from the opcode switch statement in tcg_optimize,
rather than in finish_folding based on opcode flags. Adjust folding
of conditional branches to match.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 47 +++++++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static void finish_bb(OptContext *ctx)
+{
+    /* We only optimize memory barriers across basic blocks. */
+    ctx->prev_mb = NULL;
+}
+
+static void finish_ebb(OptContext *ctx)
+{
+    finish_bb(ctx);
+    /* We only optimize across extended basic blocks. */
+    memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+    remove_mem_copy_all(ctx);
+}
+
 static void finish_folding(OptContext *ctx, TCGOp *op)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     int i, nb_oargs;
 
-    /*
-     * We only optimize extended basic blocks.  If the opcode ends a BB
-     * and is not a conditional branch, reset all temp data.
-     */
-    if (def->flags & TCG_OPF_BB_END) {
-        ctx->prev_mb = NULL;
-        if (!(def->flags & TCG_OPF_COND_BRANCH)) {
-            memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
-            remove_mem_copy_all(ctx);
-        }
-        return;
-    }
-
     nb_oargs = def->nb_oargs;
     for (i = 0; i < nb_oargs; i++) {
         TCGTemp *ts = arg_temp(op->args[i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
     if (i > 0) {
         op->opc = INDEX_op_br;
         op->args[0] = op->args[3];
+        finish_ebb(ctx);
+    } else {
+        finish_bb(ctx);
     }
-    return false;
+    return true;
 }
 
 static bool fold_brcond2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
     }
     op->opc = INDEX_op_br;
     op->args[0] = label;
-    break;
+    finish_ebb(ctx);
+    return true;
     }
-    return false;
+
+    finish_bb(ctx);
+    return true;
 }
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
+        case INDEX_op_set_label:
+        case INDEX_op_br:
+        case INDEX_op_exit_tb:
+        case INDEX_op_goto_tb:
+        case INDEX_op_goto_ptr:
+            finish_ebb(&ctx);
+            done = true;
+            break;
         default:
             break;
         }
--
2.43.0
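For readers following the control flow: a basic block ends at any branch or label, while an extended basic block additionally survives conditional branches that fall through. The patch encodes that distinction as two reset depths. A toy model of the same pattern (struct and field names here are illustrative only, not the real tcg/optimize.c types):

    #include <string.h>

    struct ToyCtx {
        const void *prev_mb;           /* last barrier seen in this BB */
        unsigned char temps_used[64];  /* known-value tracking */
    };

    /* Crossing a basic-block boundary: only barrier tracking dies. */
    static void toy_finish_bb(struct ToyCtx *ctx)
    {
        ctx->prev_mb = NULL;
    }

    /* Crossing an extended-basic-block boundary: drop everything. */
    static void toy_finish_ebb(struct ToyCtx *ctx)
    {
        toy_finish_bb(ctx);
        memset(ctx->temps_used, 0, sizeof(ctx->temps_used));
    }

    int main(void)
    {
        struct ToyCtx ctx = {0};
        toy_finish_bb(&ctx);   /* e.g. after a conditional branch */
        toy_finish_ebb(&ctx);  /* e.g. set_label, br, exit_tb */
        return 0;
    }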
There are only a few logical operations which can compute
an "affected" mask. Split out handling of this optimization
to a separate function, only to be called when applicable.

Remove the a_mask field from OptContext, as the mask is
no longer stored anywhere.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     QSIMPLEQ_HEAD(, MemCopyInfo) mem_free;
 
     /* In flight values from optimization. */
-    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
     uint64_t s_mask;  /* mask of clrsb(value) bits */
     TCGType type;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
-    uint64_t a_mask = ctx->a_mask;
     uint64_t z_mask = ctx->z_mask;
     uint64_t s_mask = ctx->s_mask;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      * type changing opcodes.
      */
     if (ctx->type == TCG_TYPE_I32) {
-        a_mask = (int32_t)a_mask;
         z_mask = (int32_t)z_mask;
         s_mask |= MAKE_64BIT_MASK(32, 32);
         ctx->z_mask = z_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (z_mask == 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
     }
+    return false;
+}
+
+/*
+ * An "affected" mask bit is 0 if and only if the result is identical
+ * to the first input.  Thus if the entire mask is 0, the operation
+ * is equivalent to a copy.
+ */
+static bool fold_affected_mask(OptContext *ctx, TCGOp *op, uint64_t a_mask)
+{
+    if (ctx->type == TCG_TYPE_I32) {
+        a_mask = (uint32_t)a_mask;
+    }
     if (a_mask == 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
      */
-    if (arg_is_const(op->args[2])) {
-        ctx->a_mask = z1 & ~z2;
+    if (arg_is_const(op->args[2]) &&
+        fold_affected_mask(ctx, op, z1 & ~z2)) {
+        return true;
     }
 
     return fold_masks(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
      */
     if (arg_is_const(op->args[2])) {
         uint64_t z2 = ~arg_info(op->args[2])->z_mask;
-        ctx->a_mask = z1 & ~z2;
+        if (fold_affected_mask(ctx, op, z1 & ~z2)) {
+            return true;
+        }
         z1 &= z2;
     }
     ctx->z_mask = z1;
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
 
     z_mask_old = arg_info(op->args[1])->z_mask;
     z_mask = extract64(z_mask_old, pos, len);
-    if (pos == 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
+    if (pos == 0 && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
+        return true;
     }
     ctx->z_mask = z_mask;
     ctx->s_mask = smask_from_zmask(z_mask);
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = z_mask;
     ctx->s_mask = s_mask;
-    if (!type_change) {
-        ctx->a_mask = s_mask & ~s_mask_old;
+    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+        return true;
     }
 
     return fold_masks(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = z_mask;
     ctx->s_mask = smask_from_zmask(z_mask);
-    if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
+    if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
+        return true;
     }
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
     s_mask |= MAKE_64BIT_MASK(len, 64 - len);
     ctx->s_mask = s_mask;
 
-    if (pos == 0) {
-        ctx->a_mask = s_mask & ~s_mask_old;
+    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+        return true;
     }
 
     return fold_masks(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     /* Assume all bits affected, no bits known zero, no sign reps. */
-    ctx.a_mask = -1;
     ctx.z_mask = -1;
     ctx.s_mask = 0;
 
--
2.43.0
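The "affected" mask is easiest to check on AND with a constant: a result bit can differ from the first input only where the input may be nonzero and the constant clears. A self-contained demonstration of the z1 & ~z2 computation used in fold_and above (stand-alone C, outside QEMU):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t z1 = 0x00ff;   /* known possible ones of input x */
        uint64_t z2 = 0xffff;   /* the constant operand */
        uint64_t a_mask = z1 & ~z2;

        /* a_mask == 0 proves x & z2 == x for every x under z1 ... */
        assert(a_mask == 0);
        for (uint64_t x = 0; x <= 0xff; x++) {
            assert((x & z2) == x);   /* ... so the op is a copy */
        }
        return 0;
    }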
Use of fold_masks should be restricted to those opcodes that
can reliably make use of it -- those with a single output,
and from higher-level folders that set up the masks.
Prepare for conversion of each folder in turn.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask = ctx->z_mask;
     uint64_t s_mask = ctx->s_mask;
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    TCGTemp *ts;
+    TempOptInfo *ti;
+
+    /* Only single-output opcodes are supported here. */
+    tcg_debug_assert(def->nb_oargs == 1);
 
     /*
      * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (ctx->type == TCG_TYPE_I32) {
         z_mask = (int32_t)z_mask;
         s_mask |= MAKE_64BIT_MASK(32, 32);
-        ctx->z_mask = z_mask;
-        ctx->s_mask = s_mask;
     }
 
     if (z_mask == 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
     }
-    return false;
+
+    ts = arg_temp(op->args[0]);
+    reset_ts(ctx, ts);
+
+    ti = ts_info(ts);
+    ti->z_mask = z_mask;
+    ti->s_mask = s_mask;
+    return true;
 }
 
 /*
--
2.43.0
Add a routine to which masks can be passed directly, rather than
storing them into OptContext.  To be used in upcoming patches.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
-static bool fold_masks(OptContext *ctx, TCGOp *op)
+/*
+ * Record "zero" and "sign" masks for the single output of @op.
+ * See TempOptInfo definition of z_mask and s_mask.
+ * If z_mask allows, fold the output to constant zero.
+ */
+static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
+                          uint64_t z_mask, uint64_t s_mask)
 {
-    uint64_t z_mask = ctx->z_mask;
-    uint64_t s_mask = ctx->s_mask;
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGTemp *ts;
     TempOptInfo *ti;
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_masks(OptContext *ctx, TCGOp *op)
+{
+    return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
+}
+
 /*
  * An "affected" mask bit is 0 if and only if the result is identical
  * to the first input.  Thus if the entire mask is 0, the operation
--
2.43.0
Consider the passed s_mask to be a minimum deduced from
either existing s_mask or from a sign-extension operation.
We may be able to deduce more from the set of known zeros.
Remove identical logic from several opcode folders.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
  * Record "zero" and "sign" masks for the single output of @op.
  * See TempOptInfo definition of z_mask and s_mask.
  * If z_mask allows, fold the output to constant zero.
+ * The passed s_mask may be augmented by z_mask.
  */
 static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
                           uint64_t z_mask, uint64_t s_mask)
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
 
     ti = ts_info(ts);
     ti->z_mask = z_mask;
-    ti->s_mask = s_mask;
+    ti->s_mask = s_mask | smask_from_zmask(z_mask);
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
-    s_mask = smask_from_zmask(z_mask);
+    s_mask = 0;
     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
     case TCG_BSWAP_OZ:
         break;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
-        s_mask = 0;
         break;
     }
     ctx->z_mask = z_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         g_assert_not_reached();
     }
     ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
-    ctx->s_mask = smask_from_zmask(ctx->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
-    ctx->s_mask = smask_from_zmask(ctx->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
         return true;
     }
     ctx->z_mask = z_mask;
-    ctx->s_mask = smask_from_zmask(z_mask);
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = z_mask;
-    ctx->s_mask = smask_from_zmask(z_mask);
     if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
     int width = 8 * memop_size(mop);
 
     if (width < 64) {
-        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
-        if (!(mop & MO_SIGN)) {
+        if (mop & MO_SIGN) {
+            ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        } else {
             ctx->z_mask = MAKE_64BIT_MASK(0, width);
-            ctx->s_mask <<= 1;
         }
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
     fold_setcond_tst_pow2(ctx, op, false);
 
     ctx->z_mask = 1;
-    ctx->s_mask = smask_from_zmask(1);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = 1;
-    ctx->s_mask = smask_from_zmask(1);
     return false;
 
 do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
         break;
     CASE_OP_32_64(ld8u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
-        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
         break;
     CASE_OP_32_64(ld16s):
         ctx->s_mask = MAKE_64BIT_MASK(16, 48);
         break;
     CASE_OP_32_64(ld16u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
-        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
         break;
     case INDEX_op_ld32s_i64:
         ctx->s_mask = MAKE_64BIT_MASK(32, 32);
         break;
     case INDEX_op_ld32u_i64:
         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
-        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
         break;
     default:
         g_assert_not_reached();
--
2.43.0
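The deduction being centralized here is the one smask_from_zmask makes: if the top N bits of a value are known zero, the value fits in an unsigned field of 64-N bits, so it has at least N-1 redundant sign bits. A stand-alone re-implementation to check that reasoning (using GCC/Clang __builtin_clzll in place of QEMU's clz64):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t smask_from_zmask(uint64_t zmask)
    {
        /* only the 0 bits matter; if the msb is set there is no info */
        int rep = zmask ? __builtin_clzll(zmask) : 64;
        if (rep == 0) {
            return 0;
        }
        rep -= 1;
        return ~(~0ull >> rep);
    }

    int main(void)
    {
        /* 16-bit value: 48 leading zeros -> 47 known sign bits */
        assert(smask_from_zmask(0xffff) == 0xfffffffffffe0000ull);
        /* msb possibly set -> nothing known about the sign */
        assert(smask_from_zmask(~0ull) == 0);
        return 0;
    }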
Change the representation from sign bit repetitions to all bits equal
to the sign bit, including the sign bit itself.

The previous format has a problem in that it is difficult to recreate
a valid sign mask after a shift operation: the "repetitions" part of
the previous format meant that applying the same shift as for the value
lead to an off-by-one value.

The new format, including the sign bit itself, means that the sign mask
can be manipulated in exactly the same way as the value, canonicalization
is easier.

Canonicalize the s_mask in fold_masks_zs, rather than requiring callers
to do so.  Treat 0 as a non-canonical but typeless input for no sign
information, which will be reset as appropriate for the data type.
We can easily fold in the data from z_mask while canonicalizing.

Temporarily disable optimizations using s_mask while each operation is
converted to use fold_masks_zs and to the new form.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 64 ++++++++++++--------------------------------
 1 file changed, 15 insertions(+), 49 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     QSIMPLEQ_HEAD(, MemCopyInfo) mem_copy;
     uint64_t val;
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
-    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
+    uint64_t s_mask;  /* mask bit is 1 if value bit matches msb */
 } TempOptInfo;
 
 typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
 
     /* In flight values from optimization. */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
-    uint64_t s_mask;  /* mask of clrsb(value) bits */
+    uint64_t s_mask;  /* mask bit is 1 if value bit matches msb */
     TCGType type;
 } OptContext;
 
-/* Calculate the smask for a specific value. */
-static uint64_t smask_from_value(uint64_t value)
-{
-    int rep = clrsb64(value);
-    return ~(~0ull >> rep);
-}
-
-/*
- * Calculate the smask for a given set of known-zeros.
- * If there are lots of zeros on the left, we can consider the remainder
- * an unsigned field, and thus the corresponding signed field is one bit
- * larger.
- */
-static uint64_t smask_from_zmask(uint64_t zmask)
-{
-    /*
-     * Only the 0 bits are significant for zmask, thus the msb itself
-     * must be zero, else we have no sign information.
-     */
-    int rep = clz64(zmask);
-    if (rep == 0) {
-        return 0;
-    }
-    rep -= 1;
-    return ~(~0ull >> rep);
-}
-
-/*
- * Recreate a properly left-aligned smask after manipulation.
- * Some bit-shuffling, particularly shifts and rotates, may
- * retain sign bits on the left, but may scatter disconnected
- * sign bits on the right.  Retain only what remains to the left.
- */
-static uint64_t smask_from_smask(int64_t smask)
-{
-    /* Only the 1 bits are significant for smask */
-    return smask_from_zmask(~smask);
-}
-
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
-        ti->s_mask = smask_from_value(ts->val);
+        ti->s_mask = INT64_MIN >> clrsb64(ts->val);
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      */
     if (i == 0) {
         ts_info(ts)->z_mask = ctx->z_mask;
-        ts_info(ts)->s_mask = ctx->s_mask;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
  * The passed s_mask may be augmented by z_mask.
  */
 static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
-                          uint64_t z_mask, uint64_t s_mask)
+                          uint64_t z_mask, int64_t s_mask)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGTemp *ts;
     TempOptInfo *ti;
+    int rep;
 
     /* Only single-output opcodes are supported here. */
     tcg_debug_assert(def->nb_oargs == 1);
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
      */
     if (ctx->type == TCG_TYPE_I32) {
         z_mask = (int32_t)z_mask;
-        s_mask |= MAKE_64BIT_MASK(32, 32);
+        s_mask |= INT32_MIN;
     }
 
     if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
 
     ti = ts_info(ts);
     ti->z_mask = z_mask;
-    ti->s_mask = s_mask | smask_from_zmask(z_mask);
+
+    /* Canonicalize s_mask and incorporate data from z_mask. */
+    rep = clz64(~s_mask);
+    rep = MAX(rep, clz64(z_mask));
+    rep = MAX(rep - 1, 0);
+    ti->s_mask = INT64_MIN >> rep;
+
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = z_mask;
     ctx->s_mask = s_mask;
-    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+    if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
     s_mask |= MAKE_64BIT_MASK(len, 64 - len);
     ctx->s_mask = s_mask;
 
-    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+    if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
 
         s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
-        ctx->s_mask = smask_from_smask(s_mask);
 
         return fold_masks(ctx, op);
     }
97
- TCGArg v2 = arg_info(op->args[2])->val;
98
+ uint64_t v1 = arg_info(op->args[1])->val;
99
+ uint64_t v2 = arg_info(op->args[2])->val;
100
+ int shr = op->args[3];
101
102
if (opc == INDEX_op_extract2_i64) {
103
- tmp = (v1 >> op->args[3]) | (v2 << (64 - op->args[3]));
104
+ tmp = (v1 >> shr) | (v2 << (64 - shr));
105
} else {
106
- tmp = (int32_t)(((uint32_t)v1 >> op->args[3]) |
107
- ((uint32_t)v2 << (32 - op->args[3])));
108
+ tmp = (int32_t)(((uint32_t)v1 >> shr) |
109
+ ((uint32_t)v2 << (32 - shr)));
110
}
111
tcg_opt_gen_movi(s, op, op->args[0], tmp);
112
break;
113
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
114
break;
115
}
116
if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
117
- tcg_target_ulong tv = arg_info(op->args[3])->val;
118
- tcg_target_ulong fv = arg_info(op->args[4])->val;
119
+ uint64_t tv = arg_info(op->args[3])->val;
120
+ uint64_t fv = arg_info(op->args[4])->val;
121
TCGCond cond = op->args[5];
122
+
123
if (fv == 1 && tv == 0) {
124
cond = tcg_invert_cond(cond);
125
} else if (!(tv == 1 && fv == 0)) {
126
--
172
--
127
2.25.1
173
2.43.0
128
129
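As a worked illustration of the new s_mask encoding above (a mask bit is 1 if that value bit matches the msb), here is a minimal standalone sketch, not part of the series. It stands in for QEMU's clrsb64()/clz64() with the gcc/clang builtins those helpers wrap, and relies on the gcc/clang definition of arithmetic right shift for negative values, as QEMU itself does:

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* Stand-ins for QEMU's clrsb64() and clz64(). */
    static int clrsb64(uint64_t x) { return __builtin_clrsbll(x); }
    static int clz64(uint64_t x) { return x ? __builtin_clzll(x) : 64; }

    /* s_mask of a known constant, as in init_ts_info above. */
    static uint64_t smask_of(uint64_t val)
    {
        return (uint64_t)(INT64_MIN >> clrsb64(val));
    }

    /* The canonicalization at the end of fold_masks_zs above. */
    static uint64_t smask_canon(uint64_t s_mask, uint64_t z_mask)
    {
        int rep = clz64(~s_mask);
        rep = MAX(rep, clz64(z_mask));
        rep = MAX(rep - 1, 0);
        return (uint64_t)(INT64_MIN >> rep);
    }

    int main(void)
    {
        /* For -128, bits 63..7 all match the msb; bits 6..0 do not. */
        printf("%016" PRIx64 "\n", smask_of((uint64_t)-128));
        /* prints ffffffffffffff80 */

        /* A value known to fit in 8 unsigned bits: 56 sign-matching bits. */
        printf("%016" PRIx64 "\n", smask_canon(0, 0xff));
        /* prints ffffffffffffff00 */
        return 0;
    }

The second call shows why s_mask may be "augmented by z_mask": known zeros at the top of the word imply that those bits match a zero msb, plus one more bit for the sign position itself.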
Previous version of this patch:

Because we now store uint64_t in TCGTemp, we can now always
store the full 64-bit duplicate immediate.  So remove the
difference between 32- and 64-bit hosts.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c   |  9 ++++-----
 tcg/tcg-op-vec.c | 39 ++++++++++-----------------------------
 tcg/tcg.c        |  7 +------
 3 files changed, 15 insertions(+), 40 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[1])->val;
-                if (tmp == arg_info(op->args[2])->val) {
-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-                    break;
-                }
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
+                                 deposit64(arg_info(op->args[1])->val, 32, 32,
+                                           arg_info(op->args[2])->val));
+                break;
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
     }
 }
 
-#define MO_REG  (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
-
-static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
-{
-    TCGTemp *rt = tcgv_vec_temp(r);
-    vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);
-}
-
 TCGv_vec tcg_const_zeros_vec(TCGType type)
 {
     TCGv_vec ret = tcg_temp_new_vec(type);
-    do_dupi_vec(ret, MO_REG, 0);
+    tcg_gen_dupi_vec(MO_64, ret, 0);
     return ret;
 }
 
 TCGv_vec tcg_const_ones_vec(TCGType type)
 {
     TCGv_vec ret = tcg_temp_new_vec(type);
-    do_dupi_vec(ret, MO_REG, -1);
+    tcg_gen_dupi_vec(MO_64, ret, -1);
     return ret;
 }
 
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
 
 void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 {
-    if (TCG_TARGET_REG_BITS == 64) {
-        do_dupi_vec(r, MO_64, a);
-    } else if (a == dup_const(MO_32, a)) {
-        do_dupi_vec(r, MO_32, a);
-    } else {
-        TCGv_i64 c = tcg_const_i64(a);
-        tcg_gen_dup_i64_vec(MO_64, r, c);
-        tcg_temp_free_i64(c);
-    }
+    tcg_gen_dupi_vec(MO_64, r, a);
 }
 
 void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
 {
-    do_dupi_vec(r, MO_REG, dup_const(MO_32, a));
+    tcg_gen_dupi_vec(MO_32, r, a);
 }
 
 void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
 {
-    do_dupi_vec(r, MO_REG, dup_const(MO_16, a));
+    tcg_gen_dupi_vec(MO_16, r, a);
 }
 
 void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
 {
-    do_dupi_vec(r, MO_REG, dup_const(MO_8, a));
+    tcg_gen_dupi_vec(MO_8, r, a);
 }
 
 void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
 {
-    if (vece == MO_64) {
-        tcg_gen_dup64i_vec(r, a);
-    } else {
-        do_dupi_vec(r, MO_REG, dup_const(vece, a));
-    }
+    TCGTemp *rt = tcgv_vec_temp(r);
+    tcg_gen_mov_vec(r, tcg_constant_vec(rt->base_type, vece, a));
 }
 
 void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
         if (tcg_can_emit_vec_op(INDEX_op_sari_vec, type, vece) > 0) {
             tcg_gen_sari_vec(vece, t, a, (8 << vece) - 1);
         } else {
-            do_dupi_vec(t, MO_REG, 0);
-            tcg_gen_cmp_vec(TCG_COND_LT, vece, t, a, t);
+            tcg_gen_cmp_vec(TCG_COND_LT, vece, t, a,
+                            tcg_constant_vec(type, vece, 0));
         }
         tcg_gen_xor_vec(vece, r, a, t);
         tcg_gen_sub_vec(vece, r, r, t);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
              * The targets will, in general, have to do this search anyway,
              * do this generically.
              */
-            if (TCG_TARGET_REG_BITS == 32) {
-                val = dup_const(MO_32, val);
-                vece = MO_32;
-            }
             if (val == dup_const(MO_8, val)) {
                 vece = MO_8;
             } else if (val == dup_const(MO_16, val)) {
                 vece = MO_16;
-            } else if (TCG_TARGET_REG_BITS == 64 &&
-                       val == dup_const(MO_32, val)) {
+            } else if (val == dup_const(MO_32, val)) {
                 vece = MO_32;
             }
 
-- 
2.25.1

Updated version of this patch:

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_ebb(OptContext *ctx)
     remove_mem_copy_all(ctx);
 }
 
-static void finish_folding(OptContext *ctx, TCGOp *op)
+static bool finish_folding(OptContext *ctx, TCGOp *op)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     int i, nb_oargs;
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
             ts_info(ts)->z_mask = ctx->z_mask;
         }
     }
+    return true;
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 /* We cannot as yet do_constant_folding with vectors. */
@@ -XXX,XX +XXX,XX @@ static bool fold_add_vec(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
         op->args[4] = arg_new_constant(ctx, bl);
         op->args[5] = arg_new_constant(ctx, bh);
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_add2(OptContext *ctx, TCGOp *op)
-- 
2.43.0

Previous version of this patch:

These will hold a single constant for the duration of the TB.
They are hashed, so that each value has one temp across the TB.

Not used yet, this is all infrastructure.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h |  24 ++++-
 tcg/optimize.c    |  13 ++-
 tcg/tcg.c         | 224 ++++++++++++++++++++++++++++++++++++----------
 3 files changed, 211 insertions(+), 50 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef enum TCGTempKind {
     TEMP_GLOBAL,
     /* Temp is in a fixed register. */
     TEMP_FIXED,
+    /* Temp is a fixed constant. */
+    TEMP_CONST,
 } TCGTempKind;
 
 typedef struct TCGTemp {
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
     QSIMPLEQ_HEAD(, TCGOp) plugin_ops;
 #endif
 
+    GHashTable *const_table[TCG_TYPE_COUNT];
     TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
     TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
 
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
 
 static inline bool temp_readonly(TCGTemp *ts)
 {
-    return ts->kind == TEMP_FIXED;
+    return ts->kind >= TEMP_FIXED;
 }
 
 extern TCGContext tcg_init_ctx;
@@ -XXX,XX +XXX,XX @@ TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc);
 
 void tcg_optimize(TCGContext *s);
 
+/* Allocate a new temporary and initialize it with a constant. */
 TCGv_i32 tcg_const_i32(int32_t val);
 TCGv_i64 tcg_const_i64(int64_t val);
 TCGv_i32 tcg_const_local_i32(int32_t val);
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec(TCGType);
 TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec);
 TCGv_vec tcg_const_ones_vec_matching(TCGv_vec);
 
+/*
+ * Locate or create a read-only temporary that is a constant.
+ * This kind of temporary need not and should not be freed.
+ */
+TCGTemp *tcg_constant_internal(TCGType type, int64_t val);
+
+static inline TCGv_i32 tcg_constant_i32(int32_t val)
+{
+    return temp_tcgv_i32(tcg_constant_internal(TCG_TYPE_I32, val));
+}
+
+static inline TCGv_i64 tcg_constant_i64(int64_t val)
+{
+    return temp_tcgv_i64(tcg_constant_internal(TCG_TYPE_I64, val));
+}
+
+TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val);
+
 #if UINTPTR_MAX == UINT32_MAX
 # define tcg_const_ptr(x)        ((TCGv_ptr)tcg_const_i32((intptr_t)(x)))
 # define tcg_const_local_ptr(x)  ((TCGv_ptr)tcg_const_local_i32((intptr_t)(x)))
diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TempOptInfo *infos,
         ts->state_ptr = ti;
         ti->next_copy = ts;
         ti->prev_copy = ts;
-        ti->is_const = false;
-        ti->mask = -1;
+        if (ts->kind == TEMP_CONST) {
+            ti->is_const = true;
+            ti->val = ti->mask = ts->val;
+            if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
+                /* High bits of a 32-bit quantity are garbage. */
+                ti->mask |= ~0xffffffffull;
+            }
+        } else {
+            ti->is_const = false;
+            ti->mask = -1;
+        }
         set_bit(idx, temps_used->l);
     }
 }
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_func_start(TCGContext *s)
     /* No temps have been previously allocated for size or locality. */
     memset(s->free_temps, 0, sizeof(s->free_temps));
 
+    /* No constant temps have been previously allocated. */
+    for (int i = 0; i < TCG_TYPE_COUNT; ++i) {
+        if (s->const_table[i]) {
+            g_hash_table_remove_all(s->const_table[i]);
+        }
+    }
+
     s->nb_ops = 0;
     s->nb_labels = 0;
     s->current_frame_offset = s->frame_start;
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
     bigendian = 1;
 #endif
 
-    if (base_ts->kind != TEMP_FIXED) {
+    switch (base_ts->kind) {
+    case TEMP_FIXED:
+        break;
+    case TEMP_GLOBAL:
         /* We do not support double-indirect registers. */
         tcg_debug_assert(!base_ts->indirect_reg);
         base_ts->indirect_base = 1;
         s->nb_indirects += (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64
                             ? 2 : 1);
         indirect_reg = 1;
+        break;
+    default:
+        g_assert_not_reached();
     }
 
     if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
     TCGContext *s = tcg_ctx;
     int k, idx;
 
+    /* In order to simplify users of tcg_constant_*, silently ignore free. */
+    if (ts->kind == TEMP_CONST) {
+        return;
+    }
+
 #if defined(CONFIG_DEBUG_TCG)
     s->temps_in_use--;
     if (s->temps_in_use < 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
     set_bit(idx, s->free_temps[k].l);
 }
 
+TCGTemp *tcg_constant_internal(TCGType type, int64_t val)
+{
+    TCGContext *s = tcg_ctx;
+    GHashTable *h = s->const_table[type];
+    TCGTemp *ts;
+
+    if (h == NULL) {
+        h = g_hash_table_new(g_int64_hash, g_int64_equal);
+        s->const_table[type] = h;
+    }
+
+    ts = g_hash_table_lookup(h, &val);
+    if (ts == NULL) {
+        ts = tcg_temp_alloc(s);
+
+        if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
+            TCGTemp *ts2 = tcg_temp_alloc(s);
+
+            ts->base_type = TCG_TYPE_I64;
+            ts->type = TCG_TYPE_I32;
+            ts->kind = TEMP_CONST;
+            ts->temp_allocated = 1;
+            /*
+             * Retain the full value of the 64-bit constant in the low
+             * part, so that the hash table works.  Actual uses will
+             * truncate the value to the low part.
+             */
+            ts->val = val;
+
+            tcg_debug_assert(ts2 == ts + 1);
+            ts2->base_type = TCG_TYPE_I64;
+            ts2->type = TCG_TYPE_I32;
+            ts2->kind = TEMP_CONST;
+            ts2->temp_allocated = 1;
+            ts2->val = val >> 32;
+        } else {
+            ts->base_type = type;
+            ts->type = type;
+            ts->kind = TEMP_CONST;
+            ts->temp_allocated = 1;
+            ts->val = val;
+        }
+        g_hash_table_insert(h, &ts->val, ts);
+    }
+
+    return ts;
+}
+
+TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val)
+{
+    val = dup_const(vece, val);
+    return temp_tcgv_vec(tcg_constant_internal(type, val));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
     TCGv_i32 t0;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_start(TCGContext *s)
         TCGTempVal val = TEMP_VAL_MEM;
 
         switch (ts->kind) {
+        case TEMP_CONST:
+            val = TEMP_VAL_CONST;
+            break;
         case TEMP_FIXED:
             val = TEMP_VAL_REG;
             break;
@@ -XXX,XX +XXX,XX @@ static char *tcg_get_arg_str_ptr(TCGContext *s, char *buf, int buf_size,
     case TEMP_NORMAL:
         snprintf(buf, buf_size, "tmp%d", idx - s->nb_globals);
         break;
+    case TEMP_CONST:
+        switch (ts->type) {
+        case TCG_TYPE_I32:
+            snprintf(buf, buf_size, "$0x%x", (int32_t)ts->val);
+            break;
+#if TCG_TARGET_REG_BITS > 32
+        case TCG_TYPE_I64:
+            snprintf(buf, buf_size, "$0x%" PRIx64, ts->val);
+            break;
+#endif
+        case TCG_TYPE_V64:
+        case TCG_TYPE_V128:
+        case TCG_TYPE_V256:
+            snprintf(buf, buf_size, "v%d$0x%" PRIx64,
+                     64 << (ts->type - TCG_TYPE_V64), ts->val);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        break;
     }
     return buf;
 }
@@ -XXX,XX +XXX,XX @@ static void la_bb_end(TCGContext *s, int ng, int nt)
             state = TS_DEAD | TS_MEM;
             break;
         case TEMP_NORMAL:
+        case TEMP_CONST:
             state = TS_DEAD;
             break;
         default:
@@ -XXX,XX +XXX,XX @@ static void la_bb_sync(TCGContext *s, int ng, int nt)
     la_global_sync(s, ng);
 
     for (int i = ng; i < nt; ++i) {
-        if (s->temps[i].kind == TEMP_LOCAL) {
-            int state = s->temps[i].state;
-            s->temps[i].state = state | TS_MEM;
+        TCGTemp *ts = &s->temps[i];
+        int state;
+
+        switch (ts->kind) {
+        case TEMP_LOCAL:
+            state = ts->state;
+            ts->state = state | TS_MEM;
             if (state != TS_DEAD) {
                 continue;
             }
-        } else {
+            break;
+        case TEMP_NORMAL:
             s->temps[i].state = TS_DEAD;
+            break;
+        case TEMP_CONST:
+            continue;
+        default:
+            g_assert_not_reached();
         }
         la_reset_pref(&s->temps[i]);
     }
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
    mark it free; otherwise mark it dead.  */
 static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
 {
-    if (temp_readonly(ts)) {
+    TCGTempVal new_type;
+
+    switch (ts->kind) {
+    case TEMP_FIXED:
         return;
+    case TEMP_GLOBAL:
+    case TEMP_LOCAL:
+        new_type = TEMP_VAL_MEM;
+        break;
+    case TEMP_NORMAL:
+        new_type = free_or_dead < 0 ? TEMP_VAL_MEM : TEMP_VAL_DEAD;
+        break;
+    case TEMP_CONST:
+        new_type = TEMP_VAL_CONST;
+        break;
+    default:
+        g_assert_not_reached();
     }
     if (ts->val_type == TEMP_VAL_REG) {
         s->reg_to_temp[ts->reg] = NULL;
     }
-    ts->val_type = (free_or_dead < 0
-                    || ts->kind != TEMP_NORMAL
-                    ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
+    ts->val_type = new_type;
 }
 
 /* Mark a temporary as dead.  */
@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
 static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
                       TCGRegSet preferred_regs, int free_or_dead)
 {
-    if (temp_readonly(ts)) {
-        return;
-    }
-    if (!ts->mem_coherent) {
+    if (!temp_readonly(ts) && !ts->mem_coherent) {
         if (!ts->mem_allocated) {
             temp_allocate_frame(s, ts);
         }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
 
     for (i = s->nb_globals; i < s->nb_temps; i++) {
         TCGTemp *ts = &s->temps[i];
-        if (ts->kind == TEMP_LOCAL) {
+
+        switch (ts->kind) {
+        case TEMP_LOCAL:
             temp_save(s, ts, allocated_regs);
-        } else {
+            break;
+        case TEMP_NORMAL:
             /* The liveness analysis already ensures that temps are dead.
                Keep an tcg_debug_assert for safety. */
             tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD);
+            break;
+        case TEMP_CONST:
+            /* Similarly, we should have freed any allocated register. */
+            tcg_debug_assert(ts->val_type == TEMP_VAL_CONST);
+            break;
+        default:
+            g_assert_not_reached();
         }
     }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs)
          * The liveness analysis already ensures that temps are dead.
          * Keep tcg_debug_asserts for safety.
          */
-        if (ts->kind == TEMP_LOCAL) {
+        switch (ts->kind) {
+        case TEMP_LOCAL:
             tcg_debug_assert(ts->val_type != TEMP_VAL_REG || ts->mem_coherent);
-        } else {
+            break;
+        case TEMP_NORMAL:
             tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD);
+            break;
+        case TEMP_CONST:
+            break;
+        default:
+            g_assert_not_reached();
         }
     }
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         i_preferred_regs = o_preferred_regs = 0;
         if (arg_ct->ialias) {
             o_preferred_regs = op->output_pref[arg_ct->alias_index];
-            if (ts->kind == TEMP_FIXED) {
-                /* if fixed register, we must allocate a new register
-                   if the alias is not the same register */
-                if (arg != op->args[arg_ct->alias_index]) {
-                    goto allocate_in_reg;
-                }
-            } else {
-                /* if the input is aliased to an output and if it is
-                   not dead after the instruction, we must allocate
-                   a new register and move it */
-                if (!IS_DEAD_ARG(i)) {
-                    goto allocate_in_reg;
-                }
 
-                /* check if the current register has already been allocated
-                   for another input aliased to an output */
-                if (ts->val_type == TEMP_VAL_REG) {
-                    int k2, i2;
-                    reg = ts->reg;
-                    for (k2 = 0 ; k2 < k ; k2++) {
-                        i2 = def->args_ct[nb_oargs + k2].sort_index;
-                        if (def->args_ct[i2].ialias && reg == new_args[i2]) {
-                            goto allocate_in_reg;
-                        }
+            /*
+             * If the input is readonly, then it cannot also be an
+             * output and aliased to itself.  If the input is not
+             * dead after the instruction, we must allocate a new
+             * register and move it.
+             */
+            if (temp_readonly(ts) || !IS_DEAD_ARG(i)) {
+                goto allocate_in_reg;
+            }
+
+            /*
+             * Check if the current register has already been allocated
+             * for another input aliased to an output.
+             */
+            if (ts->val_type == TEMP_VAL_REG) {
+                reg = ts->reg;
+                for (int k2 = 0; k2 < k; k2++) {
+                    int i2 = def->args_ct[nb_oargs + k2].sort_index;
+                    if (def->args_ct[i2].ialias && reg == new_args[i2]) {
+                        goto allocate_in_reg;
                     }
                 }
-                i_preferred_regs = o_preferred_regs;
             }
+            i_preferred_regs = o_preferred_regs;
         }
 
         temp_load(s, ts, arg_ct->regs, i_allocated_regs, i_preferred_regs);
         reg = ts->reg;
 
-        if (tcg_regset_test_reg(arg_ct->regs, reg)) {
-            /* nothing to do : the constraint is satisfied */
-        } else {
-        allocate_in_reg:
-            /* allocate a new register matching the constraint
-               and move the temporary register into it */
+        if (!tcg_regset_test_reg(arg_ct->regs, reg)) {
+ allocate_in_reg:
+            /*
+             * Allocate a new register matching the constraint
+             * and move the temporary register into it.
+             */
             temp_load(s, ts, tcg_target_available_regs[ts->type],
                       i_allocated_regs, 0);
             reg = tcg_reg_alloc(s, arg_ct->regs, i_allocated_regs,
-- 
2.25.1

Updated version of this patch:

Introduce ti_is_const, ti_const_val, ti_is_const_val.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static inline TempOptInfo *arg_info(TCGArg arg)
     return ts_info(arg_temp(arg));
 }
 
+static inline bool ti_is_const(TempOptInfo *ti)
+{
+    return ti->is_const;
+}
+
+static inline uint64_t ti_const_val(TempOptInfo *ti)
+{
+    return ti->val;
+}
+
+static inline bool ti_is_const_val(TempOptInfo *ti, uint64_t val)
+{
+    return ti_is_const(ti) && ti_const_val(ti) == val;
+}
+
 static inline bool ts_is_const(TCGTemp *ts)
 {
-    return ts_info(ts)->is_const;
+    return ti_is_const(ts_info(ts));
 }
 
 static inline bool ts_is_const_val(TCGTemp *ts, uint64_t val)
 {
-    TempOptInfo *ti = ts_info(ts);
-    return ti->is_const && ti->val == val;
+    return ti_is_const_val(ts_info(ts), val);
 }
 
 static inline bool arg_is_const(TCGArg arg)
-- 
2.43.0
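The interning scheme described above (one read-only temp per distinct constant value, hashed for the duration of the TB) can be exercised with a minimal user-space sketch using the same GLib calls the patch uses. The Temp struct here is a stand-in, not the real TCGTemp:

    /* build: gcc demo.c $(pkg-config --cflags --libs glib-2.0) */
    #include <glib.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct Temp { int64_t val; } Temp;

    static GHashTable *consts;

    /* One temp per distinct value, as tcg_constant_internal does per type. */
    static Temp *intern_const(int64_t val)
    {
        Temp *t;

        if (!consts) {
            consts = g_hash_table_new(g_int64_hash, g_int64_equal);
        }
        t = g_hash_table_lookup(consts, &val);
        if (!t) {
            t = g_new0(Temp, 1);
            t->val = val;
            /* The key points at the value stored in the temp itself. */
            g_hash_table_insert(consts, &t->val, t);
        }
        return t;
    }

    int main(void)
    {
        /* Prints 1: repeated requests return the same temp. */
        printf("%d\n", intern_const(42) == intern_const(42));
        return 0;
    }

Keying the table by a pointer into the interned object itself, rather than by a separately allocated key, is the same trick the patch uses with &ts->val; it keeps the key alive exactly as long as the entry.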
New patch

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Sink mask computation below fold_affected_mask early exit.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_add2(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z1, z2;
+    uint64_t z1, z2, z_mask, s_mask;
+    TempOptInfo *t1, *t2;
 
     if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    z1 = arg_info(op->args[1])->z_mask;
-    z2 = arg_info(op->args[2])->z_mask;
-    ctx->z_mask = z1 & z2;
-
-    /*
-     * Sign repetitions are perforce all identical, whether they are 1 or 0.
-     * Bitwise operations preserve the relative quantity of the repetitions.
-     */
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    z1 = t1->z_mask;
+    z2 = t2->z_mask;
 
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
      */
-    if (arg_is_const(op->args[2]) &&
-        fold_affected_mask(ctx, op, z1 & ~z2)) {
+    if (ti_is_const(t2) && fold_affected_mask(ctx, op, z1 & ~z2)) {
         return true;
     }
 
-    return fold_masks(ctx, op);
+    z_mask = z1 & z2;
+
+    /*
+     * Sign repetitions are perforce all identical, whether they are 1 or 0.
+     * Bitwise operations preserve the relative quantity of the repetitions.
+     */
+    s_mask = t1->s_mask & t2->s_mask;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
-- 
2.43.0
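A small worked example of what fold_and computes, detached from the TCG data structures as a hedged sketch: with known-zero masks z1, z2, the result's known zeros are z1 & z2, and when arg2 is constant the bits z1 & ~z2 are exactly the bits of arg1 that the AND provably discards:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t z1 = 0x00000000ffffffffull; /* arg1: high 32 bits known zero */
        uint64_t z2 = 0x00000000000000ffull; /* arg2 const 0xff: z_mask == value */

        /* Result can only have bits where both inputs can have bits. */
        uint64_t z_out = z1 & z2;            /* 0xff */

        /* Bits of arg1 that cannot survive the AND.  If liveness has
         * already proven the consumer independent of them, the whole
         * op degenerates to a mov; that is the fold_affected_mask test. */
        uint64_t affected = z1 & ~z2;        /* 0xffffff00 */

        printf("z_out=%#llx affected=%#llx\n",
               (unsigned long long)z_out, (unsigned long long)affected);
        return 0;
    }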
New patch

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Avoid double inversion of the value of second const operand.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z1;
+    uint64_t z_mask, s_mask;
+    TempOptInfo *t1, *t2;
 
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    z1 = arg_info(op->args[1])->z_mask;
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    z_mask = t1->z_mask;
 
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer anything from it.
      */
-    if (arg_is_const(op->args[2])) {
-        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
-        if (fold_affected_mask(ctx, op, z1 & ~z2)) {
+    if (ti_is_const(t2)) {
+        uint64_t v2 = ti_const_val(t2);
+        if (fold_affected_mask(ctx, op, z_mask & v2)) {
             return true;
         }
-        z1 &= z2;
+        z_mask &= ~v2;
     }
-    ctx->z_mask = z1;
 
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return fold_masks(ctx, op);
+    s_mask = t1->s_mask & t2->s_mask;
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
-- 
2.43.0
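The same reasoning for andc, sketched with concrete numbers; this mirrors the patch's use of the constant directly instead of inverting its z_mask twice. For dest = arg1 & ~arg2 with arg2 a known constant v2, the bits z_mask & v2 of arg1 are provably cleared (the affected set), and the surviving known-possible bits are z_mask & ~v2:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t z_mask = 0xffffull;   /* arg1: low 16 bits possibly set */
        uint64_t v2     = 0xff00ull;   /* constant arg2 */

        uint64_t affected = z_mask & v2;   /* 0xff00: inputs andc removes */
        uint64_t z_out    = z_mask & ~v2;  /* 0x00ff: what may still be set */

        printf("affected=%#llx z_out=%#llx\n",
               (unsigned long long)affected, (unsigned long long)z_out);
        return 0;
    }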
New patch

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Always set s_mask along the BSWAP_OS path, since the result is
being explicitly sign-extended.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask, s_mask, sign;
+    TempOptInfo *t1 = arg_info(op->args[1]);
 
-    if (arg_is_const(op->args[1])) {
-        uint64_t t = arg_info(op->args[1])->val;
-
-        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    if (ti_is_const(t1)) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0],
+                                do_constant_folding(op->opc, ctx->type,
+                                                    ti_const_val(t1),
+                                                    op->args[2]));
     }
 
-    z_mask = arg_info(op->args[1])->z_mask;
-
+    z_mask = t1->z_mask;
     switch (op->opc) {
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
         /* If the sign bit may be 1, force all the bits above to 1. */
         if (z_mask & sign) {
             z_mask |= sign;
-            s_mask = sign << 1;
         }
+        /* The value and therefore s_mask is explicitly sign-extended. */
+        s_mask = sign;
         break;
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
         break;
     }
-    ctx->z_mask = z_mask;
-    ctx->s_mask = s_mask;
 
-    return fold_masks(ctx, op);
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_call(OptContext *ctx, TCGOp *op)
-- 
2.43.0
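One way to see the bswap mask handling: the known-zero mask transforms exactly like the value does under the byte swap, and for the sign-extending (OS) variant the result is then explicitly sign-extended from the field's top bit, so the sign knowledge holds regardless of whether that bit may be set. A hedged sketch using the gcc/clang bswap builtin:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* If only bits 0..7 of the 16-bit input may be set... */
        uint16_t z_in = 0x00ff;
        /* ...then after bswap16 only bits 8..15 may be set. */
        uint16_t z_out = __builtin_bswap16(z_in);

        /* The OS flavor then sign-extends from bit 15.  Bit 15 may be 1
         * here, so the known-zero mask must widen to "maybe 1" above,
         * while the sign mask still covers every bit from 15 upward. */
        int64_t widened = (int16_t)z_out;

        printf("z_out=%#x widened=%#llx\n", z_out, (long long)widened);
        return 0;
    }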
New patch

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Compute s_mask from the union of the maximum count and the
op2 fallback for op1 being zero.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
 
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask;
+    uint64_t z_mask, s_mask;
+    TempOptInfo *t1 = arg_info(op->args[1]);
+    TempOptInfo *t2 = arg_info(op->args[2]);
 
-    if (arg_is_const(op->args[1])) {
-        uint64_t t = arg_info(op->args[1])->val;
+    if (ti_is_const(t1)) {
+        uint64_t t = ti_const_val(t1);
 
         if (t != 0) {
             t = do_constant_folding(op->opc, ctx->type, t, 0);
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
-    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
-    return false;
+    s_mask = ~z_mask;
+    z_mask |= t2->z_mask;
+    s_mask &= t2->s_mask;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
}
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-- 
2.43.0
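For the count-zeros fold, a worked sketch of where the masks come from: a 32-bit clz result lies in 0..32, so six low bits suffice and its known-zero mask is 32|31 = 0x3f; every bit above that is known to match a zero msb. When the input may itself be zero the op yields arg2 instead, so both masks must also be widened by arg2's masks, exactly as the patch does:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* clz32 result is 0..32: six low bits suffice. */
        uint64_t z_mask = 32 | 31;       /* 0x3f */

        /* Bits above the maximum count match a zero msb. */
        uint64_t s_mask = ~z_mask;

        /* Fallback for a zero input, e.g. an unconstrained arg2. */
        uint64_t z2 = -1ull, s2 = -1ull;

        z_mask |= z2;                    /* result may be any fallback bit   */
        s_mask &= s2;                    /* keep only jointly-known sign bits */

        printf("z=%#llx s=%#llx\n",
               (unsigned long long)z_mask, (unsigned long long)s_mask);
        return 0;
    }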
Previous version of this patch:

While we don't store more than tcg_target_long in TCGTemp,
we shouldn't be limited to that for code generation.  We will
be able to use this for INDEX_op_dup2_vec with 2 constants.

Also pass along the minimal vece that may be said to apply
to the constant.  This allows some simplification in the
various backends.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c                    | 31 +++++++++++++++++++++++++-----
 tcg/aarch64/tcg-target.c.inc | 12 ++++++------
 tcg/i386/tcg-target.c.inc    | 22 ++++++++++++---------
 tcg/ppc/tcg-target.c.inc     | 37 +++++++++++++++++++++++-------------
 4 files changed, 69 insertions(+), 33 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg dst, TCGReg src);
 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                              TCGReg dst, TCGReg base, intptr_t offset);
-static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
-                             TCGReg dst, tcg_target_long arg);
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg dst, int64_t arg);
 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
                            unsigned vece, const TCGArg *args,
                            const int *const_args);
@@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 {
     g_assert_not_reached();
 }
-static inline void tcg_out_dupi_vec(TCGContext *s, TCGType type,
-                                    TCGReg dst, tcg_target_long arg)
+static inline void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                                    TCGReg dst, int64_t arg)
 {
     g_assert_not_reached();
 }
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
         if (ts->type <= TCG_TYPE_I64) {
             tcg_out_movi(s, ts->type, reg, ts->val);
         } else {
-            tcg_out_dupi_vec(s, ts->type, reg, ts->val);
+            uint64_t val = ts->val;
+            MemOp vece = MO_64;
+
+            /*
+             * Find the minimal vector element that matches the constant.
+             * The targets will, in general, have to do this search anyway,
+             * do this generically.
+             */
+            if (TCG_TARGET_REG_BITS == 32) {
+                val = dup_const(MO_32, val);
+                vece = MO_32;
+            }
+            if (val == dup_const(MO_8, val)) {
+                vece = MO_8;
+            } else if (val == dup_const(MO_16, val)) {
+                vece = MO_16;
+            } else if (TCG_TARGET_REG_BITS == 64 &&
+                       val == dup_const(MO_32, val)) {
+                vece = MO_32;
+            }
+
+            tcg_out_dupi_vec(s, ts->type, vece, reg, ts->val);
         }
         ts->mem_coherent = 0;
         break;
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
     tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c);
 }
 
-static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
-                             TCGReg rd, tcg_target_long v64)
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg rd, int64_t v64)
 {
     bool q = type == TCG_TYPE_V128;
     int cmode, imm8, i;
 
     /* Test all bytes equal first.  */
-    if (v64 == dup_const(MO_8, v64)) {
+    if (vece == MO_8) {
         imm8 = (uint8_t)v64;
         tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0xe, imm8);
         return;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
      * cannot find an expansion there's no point checking a larger
      * width because we already know by replication it cannot match.
      */
-    if (v64 == dup_const(MO_16, v64)) {
+    if (vece == MO_16) {
         uint16_t v16 = v64;
 
         if (is_shimm16(v16, &cmode, &imm8)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
         tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0x8, v16 & 0xff);
         tcg_out_insn(s, 3606, ORR, q, rd, 0, 0xa, v16 >> 8);
         return;
-    } else if (v64 == dup_const(MO_32, v64)) {
+    } else if (vece == MO_32) {
         uint32_t v32 = v64;
         uint32_t n32 = ~v32;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
             tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
             break;
         }
-        tcg_out_dupi_vec(s, type, TCG_VEC_TMP, 0);
+        tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
         a2 = TCG_VEC_TMP;
     }
     insn = cmp_insn[cond];
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
     return true;
 }
 
-static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
-                             TCGReg ret, tcg_target_long arg)
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg ret, int64_t arg)
 {
     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
         return;
     }
 
-    if (TCG_TARGET_REG_BITS == 64) {
+    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
+        if (have_avx2) {
+            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
+        } else {
+            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
+        }
+        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
+    } else {
         if (type == TCG_TYPE_V64) {
             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
         } else if (have_avx2) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
         } else {
             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
         }
-        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
-    } else {
-        if (have_avx2) {
-            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
+        if (TCG_TARGET_REG_BITS == 64) {
+            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
         } else {
-            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
+            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
         }
-        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
     }
 }
 
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
     }
 }
 
-static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
-                             tcg_target_long val)
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg ret, int64_t val)
 {
     uint32_t load_insn;
     int rel, low;
     intptr_t add;
 
-    low = (int8_t)val;
-    if (low >= -16 && low < 16) {
-        if (val == (tcg_target_long)dup_const(MO_8, low)) {
+    switch (vece) {
+    case MO_8:
+        low = (int8_t)val;
+        if (low >= -16 && low < 16) {
             tcg_out32(s, VSPLTISB | VRT(ret) | ((val & 31) << 16));
             return;
         }
-        if (val == (tcg_target_long)dup_const(MO_16, low)) {
+        if (have_isa_3_00) {
+            tcg_out32(s, XXSPLTIB | VRT(ret) | ((val & 0xff) << 11));
+            return;
+        }
+        break;
+
+    case MO_16:
+        low = (int16_t)val;
+        if (low >= -16 && low < 16) {
             tcg_out32(s, VSPLTISH | VRT(ret) | ((val & 31) << 16));
             return;
         }
-        if (val == (tcg_target_long)dup_const(MO_32, low)) {
+        break;
+
+    case MO_32:
+        low = (int32_t)val;
+        if (low >= -16 && low < 16) {
             tcg_out32(s, VSPLTISW | VRT(ret) | ((val & 31) << 16));
             return;
         }
-    }
-    if (have_isa_3_00 && val == (tcg_target_long)dup_const(MO_8, val)) {
-        tcg_out32(s, XXSPLTIB | VRT(ret) | ((val & 0xff) << 11));
-        return;
+        break;
     }
 
     /*
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
         if (TCG_TARGET_REG_BITS == 64) {
             new_pool_label(s, val, rel, s->code_ptr, add);
         } else {
-            new_pool_l2(s, rel, s->code_ptr, add, val, val);
+            new_pool_l2(s, rel, s->code_ptr, add, val >> 32, val);
         }
     } else {
         load_insn = LVX | VRT(ret) | RB(TCG_REG_TMP1);
         if (TCG_TARGET_REG_BITS == 64) {
             new_pool_l2(s, rel, s->code_ptr, add, val, val);
         } else {
-            new_pool_l4(s, rel, s->code_ptr, add, val, val, val, val);
+            new_pool_l4(s, rel, s->code_ptr, add,
+                        val >> 32, val, val >> 32, val);
         }
     }
 
-- 
2.25.1

Updated version of this patch:

Add fold_masks_z as a trivial wrapper around fold_masks_zs.
Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_zs(OptContext *ctx, TCGOp *op,
     return true;
 }
 
+static bool fold_masks_z(OptContext *ctx, TCGOp *op, uint64_t z_mask)
+{
+    return fold_masks_zs(ctx, op, z_mask, 0);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (fold_const1(ctx, op)) {
         return true;
     }
 
     switch (ctx->type) {
     case TCG_TYPE_I32:
-        ctx->z_mask = 32 | 31;
+        z_mask = 32 | 31;
         break;
     case TCG_TYPE_I64:
-        ctx->z_mask = 64 | 63;
+        z_mask = 64 | 63;
         break;
     default:
         g_assert_not_reached();
     }
-    return false;
+    return fold_masks_z(ctx, op, z_mask);
 }
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
-- 
2.43.0
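The generic minimal-vece search that the previous version of the patch above moved into temp_load can be exercised standalone. This sketch reimplements dup_const with the same multiplications QEMU uses; the enum values are stand-ins for the real MemOp constants:

    #include <stdint.h>
    #include <stdio.h>

    enum { MO_8, MO_16, MO_32, MO_64 };

    /* Replicate the low 2**vece bytes across a 64-bit word. */
    static uint64_t dup_const(unsigned vece, uint64_t val)
    {
        switch (vece) {
        case MO_8:  return 0x0101010101010101ull * (uint8_t)val;
        case MO_16: return 0x0001000100010001ull * (uint16_t)val;
        case MO_32: return 0x0000000100000001ull * (uint32_t)val;
        default:    return val;
        }
    }

    /* A value equals its own replication iff the smaller element suffices. */
    static unsigned min_vece(uint64_t val)
    {
        if (val == dup_const(MO_8, val)) {
            return MO_8;
        } else if (val == dup_const(MO_16, val)) {
            return MO_16;
        } else if (val == dup_const(MO_32, val)) {
            return MO_32;
        }
        return MO_64;
    }

    int main(void)
    {
        printf("%u\n", min_vece(0xababababababababull));  /* 0: MO_8  */
        printf("%u\n", min_vece(0x00ff00ff00ff00ffull));  /* 1: MO_16 */
        printf("%u\n", min_vece(0x0123456789abcdefull));  /* 3: MO_64 */
        return 0;
    }

Checking the smallest element first is what makes the search minimal: replication is transitive, so a value that fails at MO_8 can still match at MO_16, but never the reverse.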
New patch
1
Avoid the use of the OptContext slots. Find TempOptInfo once.
2
When we fold to and, use fold_and.
1
3
4
Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
tcg/optimize.c | 35 +++++++++++++++++------------------
8
1 file changed, 17 insertions(+), 18 deletions(-)
9
10
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/optimize.c
13
+++ b/tcg/optimize.c
14
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
15
16
static bool fold_deposit(OptContext *ctx, TCGOp *op)
17
{
18
+ TempOptInfo *t1 = arg_info(op->args[1]);
19
+ TempOptInfo *t2 = arg_info(op->args[2]);
20
+ int ofs = op->args[3];
21
+ int len = op->args[4];
22
TCGOpcode and_opc;
23
+ uint64_t z_mask;
24
25
- if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
26
- uint64_t t1 = arg_info(op->args[1])->val;
27
- uint64_t t2 = arg_info(op->args[2])->val;
28
-
29
- t1 = deposit64(t1, op->args[3], op->args[4], t2);
30
- return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
31
+ if (ti_is_const(t1) && ti_is_const(t2)) {
32
+ return tcg_opt_gen_movi(ctx, op, op->args[0],
33
+ deposit64(ti_const_val(t1), ofs, len,
34
+ ti_const_val(t2)));
35
}
36
37
switch (ctx->type) {
38
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
39
}
40
41
/* Inserting a value into zero at offset 0. */
42
- if (arg_is_const_val(op->args[1], 0) && op->args[3] == 0) {
43
- uint64_t mask = MAKE_64BIT_MASK(0, op->args[4]);
44
+ if (ti_is_const_val(t1, 0) && ofs == 0) {
45
+ uint64_t mask = MAKE_64BIT_MASK(0, len);
46
47
op->opc = and_opc;
48
op->args[1] = op->args[2];
49
op->args[2] = arg_new_constant(ctx, mask);
50
- ctx->z_mask = mask & arg_info(op->args[1])->z_mask;
51
- return false;
52
+ return fold_and(ctx, op);
53
}
54
55
/* Inserting zero into a value. */
56
- if (arg_is_const_val(op->args[2], 0)) {
57
- uint64_t mask = deposit64(-1, op->args[3], op->args[4], 0);
58
+ if (ti_is_const_val(t2, 0)) {
59
+ uint64_t mask = deposit64(-1, ofs, len, 0);
60
61
op->opc = and_opc;
62
op->args[2] = arg_new_constant(ctx, mask);
63
- ctx->z_mask = mask & arg_info(op->args[1])->z_mask;
64
- return false;
65
+ return fold_and(ctx, op);
66
}
67
68
- ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
69
- op->args[3], op->args[4],
70
- arg_info(op->args[2])->z_mask);
71
- return false;
72
+ z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask);
73
+ return fold_masks_z(ctx, op, z_mask);
74
}
75
76
static bool fold_divide(OptContext *ctx, TCGOp *op)
77
--
78
2.43.0
diff view generated by jsdifflib
1
Improve expand_vec_shi to use sign-extraction for MO_32.
1
The input which overlaps the sign bit of the output can
2
This allows a single VSPLTISB instruction to load all of
2
have its input s_mask propagated to the output s_mask.
3
the valid shift constants.
4
3
4
Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
6
---
7
tcg/ppc/tcg-target.c.inc | 44 ++++++++++++++++++++++++----------------
7
tcg/optimize.c | 14 ++++++++++++--
8
1 file changed, 27 insertions(+), 17 deletions(-)
8
1 file changed, 12 insertions(+), 2 deletions(-)
9
9
10
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
10
diff --git a/tcg/optimize.c b/tcg/optimize.c
11
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/ppc/tcg-target.c.inc
12
--- a/tcg/optimize.c
13
+++ b/tcg/ppc/tcg-target.c.inc
13
+++ b/tcg/optimize.c
14
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
14
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
15
static void expand_vec_shi(TCGType type, unsigned vece, TCGv_vec v0,
15
TempOptInfo *t2 = arg_info(op->args[2]);
16
TCGv_vec v1, TCGArg imm, TCGOpcode opci)
16
int ofs = op->args[3];
17
{
17
int len = op->args[4];
18
- TCGv_vec t1 = tcg_temp_new_vec(type);
18
+ int width;
19
+ TCGv_vec t1;
19
TCGOpcode and_opc;
20
20
- uint64_t z_mask;
21
- /* Splat w/bytes for xxspltib. */
21
+ uint64_t z_mask, s_mask;
22
- tcg_gen_dupi_vec(MO_8, t1, imm & ((8 << vece) - 1));
22
23
+ if (vece == MO_32) {
23
if (ti_is_const(t1) && ti_is_const(t2)) {
24
+ /*
24
return tcg_opt_gen_movi(ctx, op, op->args[0],
25
+ * Only 5 bits are significant, and VSPLTISB can represent -16..15.
25
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
26
+ * So using negative numbers gets us the 4th bit easily.
26
switch (ctx->type) {
27
+ */
27
case TCG_TYPE_I32:
28
+ imm = sextract32(imm, 0, 5);
28
and_opc = INDEX_op_and_i32;
29
+ width = 32;
30
break;
31
case TCG_TYPE_I64:
32
and_opc = INDEX_op_and_i64;
33
+ width = 64;
34
break;
35
default:
36
g_assert_not_reached();
37
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
38
return fold_and(ctx, op);
39
}
40
41
+ /* The s_mask from the top portion of the deposit is still valid. */
42
+ if (ofs + len == width) {
43
+ s_mask = t2->s_mask << ofs;
29
+ } else {
44
+ } else {
30
+ imm &= (8 << vece) - 1;
45
+ s_mask = t1->s_mask & ~MAKE_64BIT_MASK(0, ofs + len);
31
+ }
46
+ }
32
+
47
+
33
+ /* Splat w/bytes for xxspltib when 2.07 allows MO_64. */
48
z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask);
34
+ t1 = tcg_constant_vec(type, MO_8, imm);
49
- return fold_masks_z(ctx, op, z_mask);
35
vec_gen_3(opci, type, vece, tcgv_vec_arg(v0),
50
+ return fold_masks_zs(ctx, op, z_mask, s_mask);
36
tcgv_vec_arg(v1), tcgv_vec_arg(t1));
37
- tcg_temp_free_vec(t1);
38
}
51
}
39
52
40
static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
53
static bool fold_divide(OptContext *ctx, TCGOp *op)
41
@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
42
{
43
TCGv_vec t1 = tcg_temp_new_vec(type);
44
TCGv_vec t2 = tcg_temp_new_vec(type);
45
- TCGv_vec t3, t4;
46
+ TCGv_vec c0, c16;
47
48
switch (vece) {
49
case MO_8:
50
@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
51
52
case MO_32:
53
tcg_debug_assert(!have_isa_2_07);
54
- t3 = tcg_temp_new_vec(type);
55
- t4 = tcg_temp_new_vec(type);
56
- tcg_gen_dupi_vec(MO_8, t4, -16);
57
+ /*
58
+ * Only 5 bits are significant, and VSPLTISB can represent -16..15.
59
+ * So using -16 is a quick way to represent 16.
60
+ */
61
+ c16 = tcg_constant_vec(type, MO_8, -16);
62
+ c0 = tcg_constant_vec(type, MO_8, 0);
63
+
64
vec_gen_3(INDEX_op_rotlv_vec, type, MO_32, tcgv_vec_arg(t1),
65
- tcgv_vec_arg(v2), tcgv_vec_arg(t4));
66
+ tcgv_vec_arg(v2), tcgv_vec_arg(c16));
67
vec_gen_3(INDEX_op_ppc_mulou_vec, type, MO_16, tcgv_vec_arg(t2),
68
tcgv_vec_arg(v1), tcgv_vec_arg(v2));
69
- tcg_gen_dupi_vec(MO_8, t3, 0);
70
- vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t3),
71
- tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
72
- vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t3),
73
- tcgv_vec_arg(t3), tcgv_vec_arg(t4));
74
- tcg_gen_add_vec(MO_32, v0, t2, t3);
75
- tcg_temp_free_vec(t3);
76
- tcg_temp_free_vec(t4);
77
+ vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t1),
78
+ tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(c0));
79
+ vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t1),
80
+ tcgv_vec_arg(t1), tcgv_vec_arg(c16));
81
+ tcg_gen_add_vec(MO_32, v0, t1, t2);
82
break;
83
84
default:
85
--
54
--
86
2.25.1
55
2.43.0
87
88
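The sign-mask propagation through deposit can be checked with plain arithmetic: when the deposited field ends at the top of the word (ofs + len == width), the result's sign bits are those of the deposited operand shifted into place; otherwise only arg1's bits above the field are still known. A hedged sketch with a local copy of deposit64:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t deposit64(uint64_t val, int ofs, int len, uint64_t fld)
    {
        uint64_t mask = (~0ull >> (64 - len)) << ofs;
        return (val & ~mask) | ((fld << ofs) & mask);
    }

    int main(void)
    {
        int ofs = 32, len = 32, width = 64;
        uint64_t s1 = 0xffffffffffff0000ull;  /* arg1 sign bits: 63..16 */
        uint64_t s2 = 0xffffffffff000000ull;  /* arg2 sign bits: 63..24 */
        uint64_t s_out, v;

        if (ofs + len == width) {
            /* Field reaches the msb: arg2's sign knowledge survives, shifted. */
            s_out = s2 << ofs;                /* bits 63..56 */
        } else {
            /* Otherwise the untouched top of arg1 keeps its own sign bits. */
            s_out = s1 & ~(~0ull >> (64 - (ofs + len)));
        }

        /* The value transforms the same way the z_mask does. */
        v = deposit64(0x1111111122222222ull, ofs, len, 0xffffffffeeeeeeeeull);

        printf("s_out=%#llx v=%#llx\n",
               (unsigned long long)s_out, (unsigned long long)v);
        return 0;
    }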
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 1)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_dup(OptContext *ctx, TCGOp *op)
-- 
2.43.0
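The shape change running through these one-line patches is a contract: every fold_* helper must now exit through a path that records known-bits state for its outputs, so `return false` (keep the op, let the caller read masks from the OptContext slots) becomes an explicit finish_folding call. A schematic sketch of that contract; every type and name here is a hypothetical stand-in, not the real TCG code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t z_mask; } Temp;
    typedef struct { Temp *out; } Op;
    typedef struct { int unused; } Ctx;

    /* Record conservative known-bits info for the op's outputs. */
    static bool finish_folding(Ctx *ctx, Op *op)
    {
        (void)ctx;
        op->out->z_mask = -1;   /* nothing known unless a fold proved it */
        return true;
    }

    /* Every exit either rewrote the op or funneled through
     * finish_folding, so output masks can never be left stale. */
    static bool fold_example(Ctx *ctx, Op *op, bool simplified)
    {
        if (simplified) {
            return true;
        }
        return finish_folding(ctx, op);
    }

    int main(void)
    {
        Temp t = { 0 };
        Op op = { &t };
        Ctx ctx = { 0 };

        fold_example(&ctx, &op, false);
        printf("%llx\n", (unsigned long long)t.z_mask);  /* ffffffffffffffff */
        return 0;
    }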
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_dup(OptContext *ctx, TCGOp *op)
         t = dup_const(TCGOP_VECE(op), t);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_dup2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
         op->opc = INDEX_op_dup_vec;
         TCGOP_VECE(op) = MO_32;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
-- 
2.43.0
Previous version of this patch:

In most, but not all, places that we check for TEMP_FIXED,
we are really testing that we do not modify the temporary.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h |  5 +++++
 tcg/tcg.c         | 21 ++++++++++-----------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
     target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
 };
 
+static inline bool temp_readonly(TCGTemp *ts)
+{
+    return ts->kind == TEMP_FIXED;
+}
+
 extern TCGContext tcg_init_ctx;
 extern __thread TCGContext *tcg_ctx;
 extern const void *tcg_code_gen_epilogue;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
    mark it free; otherwise mark it dead.  */
 static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
 {
-    if (ts->kind == TEMP_FIXED) {
+    if (temp_readonly(ts)) {
         return;
     }
     if (ts->val_type == TEMP_VAL_REG) {
@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
 static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
                       TCGRegSet preferred_regs, int free_or_dead)
 {
-    if (ts->kind == TEMP_FIXED) {
+    if (temp_readonly(ts)) {
         return;
     }
     if (!ts->mem_coherent) {
@@ -XXX,XX +XXX,XX @@ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs)
 {
     /* The liveness analysis already ensures that globals are back
        in memory. Keep an tcg_debug_assert for safety. */
-    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM
-                     || ts->kind == TEMP_FIXED);
+    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || temp_readonly(ts));
 }
 
 /* save globals to their canonical location and assume they can be
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
                                   TCGRegSet preferred_regs)
 {
     /* ENV should not be modified.  */
-    tcg_debug_assert(ots->kind != TEMP_FIXED);
+    tcg_debug_assert(!temp_readonly(ots));
 
     /* The movi is not explicitly generated here.  */
     if (ots->val_type == TEMP_VAL_REG) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
     ts = arg_temp(op->args[1]);
 
     /* ENV should not be modified.  */
-    tcg_debug_assert(ots->kind != TEMP_FIXED);
+    tcg_debug_assert(!temp_readonly(ots));
 
     /* Note that otype != itype for no-op truncation.  */
     otype = ots->type;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
              * Store the source register into the destination slot
              * and leave the destination temp as TEMP_VAL_MEM.
              */
-            assert(ots->kind != TEMP_FIXED);
+            assert(!temp_readonly(ots));
             if (!ts->mem_allocated) {
                 temp_allocate_frame(s, ots);
             }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
     its = arg_temp(op->args[1]);
 
     /* ENV should not be modified.  */
-    tcg_debug_assert(ots->kind != TEMP_FIXED);
+    tcg_debug_assert(!temp_readonly(ots));
 
     itype = its->type;
     vece = TCGOP_VECE(op);
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         ts = arg_temp(arg);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(ts->kind != TEMP_FIXED);
+        tcg_debug_assert(!temp_readonly(ts));
 
         if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
             reg = new_args[arg_ct->alias_index];
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         ts = arg_temp(op->args[i]);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(ts->kind != TEMP_FIXED);
+        tcg_debug_assert(!temp_readonly(ts));
 
         if (NEED_SYNC_ARG(i)) {
             temp_sync(s, ts, o_allocated_regs, 0, IS_DEAD_ARG(i));
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
         ts = arg_temp(arg);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(ts->kind != TEMP_FIXED);
+        tcg_debug_assert(!temp_readonly(ts));
 
         reg = tcg_target_call_oarg_regs[i];
         tcg_debug_assert(s->reg_to_temp[reg] == NULL);
-- 
2.25.1

Updated version of this patch:

Add fold_masks_s as a trivial wrapper around fold_masks_zs.
Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_z(OptContext *ctx, TCGOp *op, uint64_t z_mask)
     return fold_masks_zs(ctx, op, z_mask, 0);
 }
 
+static bool fold_masks_s(OptContext *ctx, TCGOp *op, uint64_t s_mask)
+{
+    return fold_masks_zs(ctx, op, -1, s_mask);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask;
+
     if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
 
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return false;
+    s_mask = arg_info(op->args[1])->s_mask
+           & arg_info(op->args[2])->s_mask;
+    return fold_masks_s(ctx, op, s_mask);
 }
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
-- 
2.43.0
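For readers following the bookkeeping: z_mask tracks which bits of a value may still be nonzero, and s_mask (in the representation this series converges on) marks the high-order bits guaranteed to be copies of the sign bit. A standalone sketch of why fold_eqv above combines the two source s_masks with AND; the mask semantics are assumed from context, not quoted from the series:

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
      /* One input sign-repeats from bit 7 up, the other from bit 15 up. */
      uint64_t s1 = (uint64_t)(int64_t)INT8_MIN;   /* 0xffffffffffffff80 */
      uint64_t s2 = (uint64_t)(int64_t)INT16_MIN;  /* 0xffffffffffff8000 */

      /* Only repetitions common to both inputs survive a bitwise op,
       * so the combined guarantee is the intersection. */
      assert((s1 & s2) == (uint64_t)(int64_t)INT16_MIN);
      return 0;
  }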
New patch

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask_old, z_mask;
+    TempOptInfo *t1 = arg_info(op->args[1]);
     int pos = op->args[2];
     int len = op->args[3];
 
-    if (arg_is_const(op->args[1])) {
-        uint64_t t;
-
-        t = arg_info(op->args[1])->val;
-        t = extract64(t, pos, len);
-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    if (ti_is_const(t1)) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0],
+                                extract64(ti_const_val(t1), pos, len));
     }
 
-    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask_old = t1->z_mask;
     z_mask = extract64(z_mask_old, pos, len);
     if (pos == 0 && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
         return true;
     }
-    ctx->z_mask = z_mask;
 
-    return fold_masks(ctx, op);
+    return fold_masks_z(ctx, op, z_mask);
 }
 
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
--
2.43.0
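Applying the same extract64 to the known-bits mask is valid because extraction only selects bits, it cannot set new ones. A minimal standalone demonstration, with extract64 reimplemented here for illustration (assuming len < 64):

  #include <stdint.h>
  #include <stdio.h>

  /* Illustrative stand-in for QEMU's extract64(), valid for len < 64. */
  static uint64_t extract64(uint64_t value, int pos, int len)
  {
      return (value >> pos) & (~0ull >> (64 - len));
  }

  int main(void)
  {
      uint64_t z_mask_old = 0xfff;                    /* bits 0..11 may be set */
      uint64_t z_mask = extract64(z_mask_old, 4, 8);
      printf("%#llx\n", (unsigned long long)z_mask);  /* 0xff: bits 0..7 */
      return 0;
  }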
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
         }
         return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.  Find TempOptInfo once.
Explicitly sign-extend z_mask instead of doing that manually.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    uint64_t s_mask_old, s_mask, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask;
     bool type_change = false;
+    TempOptInfo *t1;
 
     if (fold_const1(ctx, op)) {
         return true;
     }
 
-    z_mask = arg_info(op->args[1])->z_mask;
-    s_mask = arg_info(op->args[1])->s_mask;
+    t1 = arg_info(op->args[1]);
+    z_mask = t1->z_mask;
+    s_mask = t1->s_mask;
     s_mask_old = s_mask;
 
     switch (op->opc) {
     CASE_OP_32_64(ext8s):
-        sign = INT8_MIN;
-        z_mask = (uint8_t)z_mask;
+        s_mask |= INT8_MIN;
+        z_mask = (int8_t)z_mask;
         break;
     CASE_OP_32_64(ext16s):
-        sign = INT16_MIN;
-        z_mask = (uint16_t)z_mask;
+        s_mask |= INT16_MIN;
+        z_mask = (int16_t)z_mask;
         break;
     case INDEX_op_ext_i32_i64:
         type_change = true;
         QEMU_FALLTHROUGH;
     case INDEX_op_ext32s_i64:
-        sign = INT32_MIN;
-        z_mask = (uint32_t)z_mask;
+        s_mask |= INT32_MIN;
+        z_mask = (int32_t)z_mask;
         break;
     default:
         g_assert_not_reached();
     }
 
-    if (z_mask & sign) {
-        z_mask |= sign;
-    }
-    s_mask |= sign << 1;
-
-    ctx->z_mask = z_mask;
-    ctx->s_mask = s_mask;
     if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
-    return fold_masks(ctx, op);
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_extu(OptContext *ctx, TCGOp *op)
--
2.43.0
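The "explicitly sign-extend" change above can be checked in isolation: a signed cast performs the truncate-and-widen that the removed code did with a conditional OR. A quick standalone check (the double cast mirrors the patch's single (int8_t) cast while keeping the conversion well defined):

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
      uint64_t z_mask = 0x190;   /* bit 7, the future sign bit, may be set */
      uint64_t sign = (uint64_t)(int64_t)INT8_MIN;

      /* Removed style: truncate, then widen when the sign bit may be set. */
      uint64_t z_old = (uint8_t)z_mask;
      if (z_old & sign) {
          z_old |= sign;
      }

      /* New style: one signed cast does both steps. */
      uint64_t z_new = (uint64_t)(int64_t)(int8_t)(uint8_t)z_mask;

      assert(z_old == z_new);
      return 0;
  }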
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
         g_assert_not_reached();
     }
 
-    ctx->z_mask = z_mask;
     if (!type_change && fold_affected_mask(ctx, op, z_mask_old ^ z_mask)) {
         return true;
     }
-    return fold_masks(ctx, op);
+
+    return fold_masks_z(ctx, op, z_mask);
 }
 
 static bool fold_mb(OptContext *ctx, TCGOp *op)
--
2.43.0
Prefer TEMP_CONST over anything else.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_arg_info(TempOptInfo *infos,
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 {
-    TCGTemp *i;
+    TCGTemp *i, *g, *l;
 
-    /* If this is already a global, we can't do better. */
-    if (ts->kind >= TEMP_GLOBAL) {
+    /* If this is already readonly, we can't do better. */
+    if (temp_readonly(ts)) {
         return ts;
     }
 
-    /* Search for a global first. */
+    g = l = NULL;
     for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-        if (i->kind >= TEMP_GLOBAL) {
+        if (temp_readonly(i)) {
             return i;
-        }
-    }
-
-    /* If it is a temp, search for a temp local. */
-    if (ts->kind == TEMP_NORMAL) {
-        for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-            if (i->kind >= TEMP_LOCAL) {
-                return i;
+        } else if (i->kind > ts->kind) {
+            if (i->kind == TEMP_GLOBAL) {
+                g = i;
+            } else if (i->kind == TEMP_LOCAL) {
+                l = i;
             }
         }
     }
 
-    /* Failure to find a better representation, return the same temp. */
-    return ts;
+    /* If we didn't find a better representation, return the same temp. */
+    return g ? g : l ? l : ts;
 }
 
 static bool ts_are_copies(TCGTemp *ts1, TCGTemp *ts2)
--
2.25.1

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, s_mask;
+    TempOptInfo *tt, *ft;
     int i;
 
     /* If true and false values are the same, eliminate the cmp. */
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
 
-    ctx->z_mask = arg_info(op->args[3])->z_mask
-                | arg_info(op->args[4])->z_mask;
-    ctx->s_mask = arg_info(op->args[3])->s_mask
-                & arg_info(op->args[4])->s_mask;
+    tt = arg_info(op->args[3]);
+    ft = arg_info(op->args[4]);
+    z_mask = tt->z_mask | ft->z_mask;
+    s_mask = tt->s_mask & ft->s_mask;
 
-    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-        uint64_t tv = arg_info(op->args[3])->val;
-        uint64_t fv = arg_info(op->args[4])->val;
+    if (ti_is_const(tt) && ti_is_const(ft)) {
+        uint64_t tv = ti_const_val(tt);
+        uint64_t fv = ti_const_val(ft);
         TCGOpcode opc, negopc = 0;
         TCGCond cond = op->args[5];
 
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
         }
     }
-    return false;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_mul(OptContext *ctx, TCGOp *op)
--
2.43.0
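The mask merge in fold_movcond follows from movcond selecting one of its two inputs at runtime, so any guarantee about the result must hold for both candidates: possible-one bits take the union, sign repetitions take the intersection. A standalone illustration:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      uint64_t z_t = 0x00ff, z_f = 0x0f00;            /* per-input z_mask */
      uint64_t s_t = ~0ull << 8, s_f = ~0ull << 16;   /* per-input s_mask */

      /* Either input may be chosen, so combine conservatively. */
      printf("z_mask = %#llx\n", (unsigned long long)(z_t | z_f)); /* 0xfff */
      printf("s_mask = %#llx\n", (unsigned long long)(s_t & s_f)); /* ~0 << 16 */
      return 0;
  }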
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 1)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
         fold_xi_to_i(ctx, op, 0)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_multiply2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
         tcg_opt_gen_movi(ctx, op2, rh, h);
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask;
+
     if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
 
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return false;
+    s_mask = arg_info(op->args[1])->s_mask
+           & arg_info(op->args[2])->s_mask;
+    return fold_masks_s(ctx, op, s_mask);
 }
 
 static bool fold_neg_no_const(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_neg_no_const(OptContext *ctx, TCGOp *op)
 {
     /* Set to 1 all bits to the left of the rightmost.  */
     uint64_t z_mask = arg_info(op->args[1])->z_mask;
-    ctx->z_mask = -(z_mask & -z_mask);
+    z_mask = -(z_mask & -z_mask);
 
-    /*
-     * Because of fold_sub_to_neg, we want to always return true,
-     * via finish_folding.
-     */
-    finish_folding(ctx, op);
-    return true;
+    return fold_masks_z(ctx, op, z_mask);
 }
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
--
2.43.0
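The expression -(z_mask & -z_mask) above can be read as: negation may produce anything at or above the lowest possibly-set input bit, but never below it. A standalone check:

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
      uint64_t z_mask = 0x0c;                 /* bits 2 and 3 may be set */

      /* z & -z isolates the lowest set bit; negating that sets it and
       * every bit above it, which is the result's z_mask. */
      uint64_t folded = -(z_mask & -z_mask);  /* bits 2..63 may be set */

      assert(folded == ~(uint64_t)0x3);
      return 0;
  }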
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask;
+
     if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
 
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return false;
+    s_mask = arg_info(op->args[1])->s_mask
+           & arg_info(op->args[2])->s_mask;
+    return fold_masks_s(ctx, op, s_mask);
 }
 
 static bool fold_not(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
     if (fold_const1(ctx, op)) {
         return true;
     }
-
-    ctx->s_mask = arg_info(op->args[1])->s_mask;
-
-    /* Because of fold_to_not, we want to always return true, via finish. */
-    finish_folding(ctx, op);
-    return true;
+    return fold_masks_s(ctx, op, arg_info(op->args[1])->s_mask);
 }
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, s_mask;
+    TempOptInfo *t1, *t2;
+
     if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
 
-    ctx->z_mask = arg_info(op->args[1])->z_mask
-                | arg_info(op->args[2])->z_mask;
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return fold_masks(ctx, op);
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    z_mask = t1->z_mask | t2->z_mask;
+    s_mask = t1->s_mask & t2->s_mask;
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask;
+
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, -1) ||
         fold_xi_to_x(ctx, op, -1) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return false;
+    s_mask = arg_info(op->args[1])->s_mask
+           & arg_info(op->args[2])->s_mask;
+    return fold_masks_s(ctx, op, s_mask);
 }
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
--
2.43.0
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 108 ++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 59 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, uint64_t val)
-{
-    const TCGOpDef *def;
-    TCGOpcode new_op;
-    uint64_t mask;
-    TempOptInfo *di = arg_info(dst);
-
-    def = &tcg_op_defs[op->opc];
-    if (def->flags & TCG_OPF_VECTOR) {
-        new_op = INDEX_op_dupi_vec;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        new_op = INDEX_op_movi_i64;
-    } else {
-        new_op = INDEX_op_movi_i32;
-    }
-    op->opc = new_op;
-    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
-    op->args[0] = dst;
-    op->args[1] = val;
-
-    reset_temp(dst);
-    di->is_const = true;
-    di->val = val;
-    mask = val;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_movi_i32) {
-        /* High bits of the destination are now garbage.  */
-        mask |= ~0xffffffffull;
-    }
-    di->mask = mask;
-}
-
 static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
+static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+                             TCGOp *op, TCGArg dst, uint64_t val)
+{
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    TCGType type;
+    TCGTemp *tv;
+
+    if (def->flags & TCG_OPF_VECTOR) {
+        type = TCGOP_VECL(op) + TCG_TYPE_V64;
+    } else if (def->flags & TCG_OPF_64BIT) {
+        type = TCG_TYPE_I64;
+    } else {
+        type = TCG_TYPE_I32;
+    }
+
+    /* Convert movi to mov with constant temp. */
+    tv = tcg_constant_internal(type, val);
+    init_ts_info(temps_used, tv);
+    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+}
+
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
 {
     uint64_t l64, h64;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
 
-    bitmap_zero(temps_used.l, nb_temps);
+    memset(&temps_used, 0, sizeof(temps_used));
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, op, op->args[0], 0);
+            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         CASE_OP_32_64(movi):
         case INDEX_op_dupi_vec:
-            tcg_opt_gen_movi(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], op->args[1]);
             break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[1])->val;
                 if (tmp == arg_info(op->args[2])->val) {
-                    tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                     break;
                 }
             } else if (args_are_copies(op->args[1], op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                            op->args[1], op->args[2]);
             if (tmp != 2) {
                 if (tmp) {
-                    bitmap_zero(temps_used.l, nb_temps);
+                    memset(&temps_used, 0, sizeof(temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[3];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 uint64_t a = ((uint64_t)ah << 32) | al;
                 uint64_t b = ((uint64_t)bh << 32) | bl;
                 TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32);
+                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
 
                 if (opc == INDEX_op_add2_i32) {
                     a += b;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 uint32_t b = arg_info(op->args[3])->val;
                 uint64_t r = (uint64_t)a * b;
                 TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32);
+                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
                 if (tmp) {
             do_brcond_true:
-                    bitmap_zero(temps_used.l, nb_temps);
+                    memset(&temps_used, 0, sizeof(temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[5];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                bitmap_zero(temps_used.l, nb_temps);
+                memset(&temps_used, 0, sizeof(temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     goto do_default;
                 }
             do_brcond_low:
-                bitmap_zero(temps_used.l, nb_temps);
+                memset(&temps_used, 0, sizeof(temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
             } else if ((op->args[5] == TCG_COND_LT
                         || op->args[5] == TCG_COND_GE)
                        && arg_is_const(op->args[3])
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            block, otherwise we only trash the output args.  "mask" is
            the non-zero bits mask for the first output arg.  */
         if (def->flags & TCG_OPF_BB_END) {
-            bitmap_zero(temps_used.l, nb_temps);
+            memset(&temps_used, 0, sizeof(temps_used));
         } else {
         do_reset_output:
             for (i = 0; i < nb_oargs; i++) {
--
2.25.1

Avoid the use of the OptContext slots.

Be careful not to call fold_masks_zs when the memory operation
is wide enough to require multiple outputs, so split into two
functions: fold_qemu_ld_1reg and fold_qemu_ld_2reg.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
     return fold_masks_s(ctx, op, s_mask);
 }
 
-static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+static bool fold_qemu_ld_1reg(OptContext *ctx, TCGOp *op)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
+    uint64_t z_mask = -1, s_mask = 0;
 
     if (width < 64) {
         if (mop & MO_SIGN) {
-            ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+            s_mask = MAKE_64BIT_MASK(width - 1, 64 - (width - 1));
         } else {
-            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            z_mask = MAKE_64BIT_MASK(0, width);
         }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization. */
     ctx->prev_mb = NULL;
-    return false;
+
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
+}
+
+static bool fold_qemu_ld_2reg(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization. */
+    ctx->prev_mb = NULL;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         case INDEX_op_qemu_ld_a32_i32:
         case INDEX_op_qemu_ld_a64_i32:
+            done = fold_qemu_ld_1reg(&ctx, op);
+            break;
         case INDEX_op_qemu_ld_a32_i64:
         case INDEX_op_qemu_ld_a64_i64:
+            if (TCG_TARGET_REG_BITS == 64) {
+                done = fold_qemu_ld_1reg(&ctx, op);
+                break;
+            }
+            QEMU_FALLTHROUGH;
         case INDEX_op_qemu_ld_a32_i128:
         case INDEX_op_qemu_ld_a64_i128:
-            done = fold_qemu_ld(&ctx, op);
+            done = fold_qemu_ld_2reg(&ctx, op);
             break;
         case INDEX_op_qemu_st8_a32_i32:
         case INDEX_op_qemu_st8_a64_i32:
--
2.43.0
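The width - 1 in fold_qemu_ld_1reg reflects the series' s_mask convention, where the sign bit itself counts among the repetitions (an assumption taken from context here). For an 8-bit signed load the resulting mask covers bits 63..7; the macro below matches QEMU's definition in bitops.h:

  #include <stdint.h>
  #include <stdio.h>

  #define MAKE_64BIT_MASK(shift, length) \
      (((~0ULL) >> (64 - (length))) << (shift))

  int main(void)
  {
      /* An 8-bit signed load sign-extends bit 7 into bits 63..8, so
       * bits 63..7 are all copies of the loaded sign bit. */
      int width = 8;
      uint64_t s_mask = MAKE_64BIT_MASK(width - 1, 64 - (width - 1));
      printf("%#llx\n", (unsigned long long)s_mask);  /* 0xffffffffffffff80 */
      return 0;
  }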
Fix this name vs our coding style.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
     glue(glue(case INDEX_op_, x), _i64):    \
     glue(glue(case INDEX_op_, x), _vec)
 
-struct tcg_temp_info {
+typedef struct TempOptInfo {
     bool is_const;
     TCGTemp *prev_copy;
     TCGTemp *next_copy;
     tcg_target_ulong val;
     tcg_target_ulong mask;
-};
+} TempOptInfo;
 
-static inline struct tcg_temp_info *ts_info(TCGTemp *ts)
+static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
 }
 
-static inline struct tcg_temp_info *arg_info(TCGArg arg)
+static inline TempOptInfo *arg_info(TCGArg arg)
 {
     return ts_info(arg_temp(arg));
 }
@@ -XXX,XX +XXX,XX @@ static inline bool ts_is_copy(TCGTemp *ts)
 /* Reset TEMP's state, possibly removing the temp for the list of copies.  */
 static void reset_ts(TCGTemp *ts)
 {
-    struct tcg_temp_info *ti = ts_info(ts);
-    struct tcg_temp_info *pi = ts_info(ti->prev_copy);
-    struct tcg_temp_info *ni = ts_info(ti->next_copy);
+    TempOptInfo *ti = ts_info(ts);
+    TempOptInfo *pi = ts_info(ti->prev_copy);
+    TempOptInfo *ni = ts_info(ti->next_copy);
 
     ni->prev_copy = ti->prev_copy;
     pi->next_copy = ti->next_copy;
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(struct tcg_temp_info *infos,
+static void init_ts_info(TempOptInfo *infos,
                          TCGTempSet *temps_used, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
     if (!test_bit(idx, temps_used->l)) {
-        struct tcg_temp_info *ti = &infos[idx];
+        TempOptInfo *ti = &infos[idx];
 
         ts->state_ptr = ti;
         ti->next_copy = ts;
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(struct tcg_temp_info *infos,
     }
 }
 
-static void init_arg_info(struct tcg_temp_info *infos,
+static void init_arg_info(TempOptInfo *infos,
                           TCGTempSet *temps_used, TCGArg arg)
 {
     init_ts_info(infos, temps_used, arg_temp(arg));
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val)
     const TCGOpDef *def;
     TCGOpcode new_op;
     tcg_target_ulong mask;
-    struct tcg_temp_info *di = arg_info(dst);
+    TempOptInfo *di = arg_info(dst);
 
     def = &tcg_op_defs[op->opc];
     if (def->flags & TCG_OPF_VECTOR) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
     const TCGOpDef *def;
-    struct tcg_temp_info *di;
-    struct tcg_temp_info *si;
+    TempOptInfo *di;
+    TempOptInfo *si;
     tcg_target_ulong mask;
     TCGOpcode new_op;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     di->mask = mask;
 
     if (src_ts->type == dst_ts->type) {
-        struct tcg_temp_info *ni = ts_info(si->next_copy);
+        TempOptInfo *ni = ts_info(si->next_copy);
 
         di->next_copy = si->next_copy;
         di->prev_copy = src_ts;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    struct tcg_temp_info *infos;
+    TempOptInfo *infos;
     TCGTempSet temps_used;
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
     bitmap_zero(temps_used.l, nb_temps);
-    infos = tcg_malloc(sizeof(struct tcg_temp_info) * nb_temps);
+    infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         tcg_target_ulong mask, partmask, affected;
--
2.25.1

Stores have no output operands, and so need no further work.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
 {
     /* Opcodes that touch guest memory stop the mb optimization. */
     ctx->prev_mb = NULL;
-    return false;
+    return true;
 }
 
 static bool fold_remainder(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st(OptContext *ctx, TCGOp *op)
 
     if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
         remove_mem_copy_all(ctx);
-        return false;
+        return true;
     }
 
     switch (op->opc) {
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st(OptContext *ctx, TCGOp *op)
         g_assert_not_reached();
     }
     remove_mem_copy_in(ctx, ofs, ofs + lm1);
-    return false;
+    return true;
 }
 
 static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
     TCGType type;
 
     if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
-        fold_tcg_st(ctx, op);
-        return false;
+        return fold_tcg_st(ctx, op);
     }
 
     src = arg_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
     last = ofs + tcg_type_size(type) - 1;
     remove_mem_copy_in(ctx, ofs, last);
     record_mem_copy(ctx, type, src, ofs, last);
-    return false;
+    return true;
 }
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
         fold_xx_to_i(ctx, op, 0)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
--
2.43.0
New patch

Change return from bool to int; distinguish between
complete folding, simplification, and no change.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
     return finish_folding(ctx, op);
 }
 
-static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
+/* Return 1 if finished, -1 if simplified, 0 if unchanged. */
+static int fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
 {
     uint64_t a_zmask, b_val;
     TCGCond cond;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond_zmask(OptContext *ctx, TCGOp *op, bool neg)
             op->opc = xor_opc;
             op->args[2] = arg_new_constant(ctx, 1);
         }
-        return false;
+        return -1;
     }
-
-    return false;
+    return 0;
 }
 
 static void fold_setcond_tst_pow2(OptContext *ctx, TCGOp *op, bool neg)
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
 
-    if (fold_setcond_zmask(ctx, op, false)) {
+    i = fold_setcond_zmask(ctx, op, false);
+    if (i > 0) {
         return true;
     }
-    fold_setcond_tst_pow2(ctx, op, false);
+    if (i == 0) {
+        fold_setcond_tst_pow2(ctx, op, false);
+    }
 
     ctx->z_mask = 1;
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_movi(ctx, op, op->args[0], -i);
     }
 
-    if (fold_setcond_zmask(ctx, op, true)) {
+    i = fold_setcond_zmask(ctx, op, true);
+    if (i > 0) {
         return true;
     }
-    fold_setcond_tst_pow2(ctx, op, true);
+    if (i == 0) {
+        fold_setcond_tst_pow2(ctx, op, true);
+    }
 
     /* Value is {0,-1} so all bits are repetitions of the sign. */
     ctx->s_mask = -1;
--
2.43.0
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
         fold_setcond_tst_pow2(ctx, op, false);
     }
 
-    ctx->z_mask = 1;
-    return false;
+    return fold_masks_z(ctx, op, 1);
 }
 
 static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_negsetcond(OptContext *ctx, TCGOp *op)
     }
 
     /* Value is {0,-1} so all bits are repetitions of the sign. */
-    ctx->s_mask = -1;
-    return false;
+    return fold_masks_s(ctx, op, -1);
 }
 
 static bool fold_setcond2(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
         return fold_setcond(ctx, op);
     }
 
-    ctx->z_mask = 1;
-    return false;
+    return fold_masks_z(ctx, op, 1);
 
 do_setcond_const:
     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
--
2.43.0
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_cmp_vec(OptContext *ctx, TCGOp *op)
     if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
         op->args[3] = tcg_swap_cond(op->args[3]);
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
     if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
         op->args[5] = tcg_invert_cond(op->args[5]);
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask, s_mask, s_mask_old;
+    TempOptInfo *t1 = arg_info(op->args[1]);
     int pos = op->args[2];
     int len = op->args[3];
 
-    if (arg_is_const(op->args[1])) {
-        uint64_t t;
-
-        t = arg_info(op->args[1])->val;
-        t = sextract64(t, pos, len);
-        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    if (ti_is_const(t1)) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0],
+                                sextract64(ti_const_val(t1), pos, len));
     }
 
-    z_mask = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask, pos, len);
-    ctx->z_mask = z_mask;
-
-    s_mask_old = arg_info(op->args[1])->s_mask;
-    s_mask = sextract64(s_mask_old, pos, len);
-    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
-    ctx->s_mask = s_mask;
+    s_mask_old = t1->s_mask;
+    s_mask = s_mask_old >> pos;
+    s_mask |= -1ull << (len - 1);
 
     if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
-    return fold_masks(ctx, op);
+    z_mask = sextract64(t1->z_mask, pos, len);
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
--
2.43.0
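The two s_mask lines in fold_sextract compose as follows: the field's own sign extension guarantees repetitions from bit len - 1 up, and whatever repetitions the input already had slide right by pos on top of that. A standalone illustration:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      int pos = 8, len = 8;
      uint64_t s_mask_old = ~0ull << 32;  /* input repeats sign from bit 32 */

      /* Shift the inherited guarantee, then OR in the extraction's own. */
      uint64_t s_mask = (s_mask_old >> pos) | (~0ull << (len - 1));

      printf("%#llx\n", (unsigned long long)s_mask);  /* ~0ull << 7 */
      return 0;
  }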
We must do this before we adjust tcg_out_movi_i32, lest the
under-the-hood poking that we do for icount be broken.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/gen-icount.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -XXX,XX +XXX,XX @@ static inline void gen_io_end(void)
 
 static inline void gen_tb_start(const TranslationBlock *tb)
 {
-    TCGv_i32 count, imm;
+    TCGv_i32 count;
 
     tcg_ctx->exitreq_label = gen_new_label();
     if (tb_cflags(tb) & CF_USE_ICOUNT) {
@@ -XXX,XX +XXX,XX @@ static inline void gen_tb_start(const TranslationBlock *tb)
                          offsetof(ArchCPU, env));
 
     if (tb_cflags(tb) & CF_USE_ICOUNT) {
-        imm = tcg_temp_new_i32();
-        /* We emit a movi with a dummy immediate argument. Keep the insn index
-         * of the movi so that we later (when we know the actual insn count)
-         * can update the immediate argument with the actual insn count.  */
-        tcg_gen_movi_i32(imm, 0xdeadbeef);
+        /*
+         * We emit a sub with a dummy immediate argument. Keep the insn index
+         * of the sub so that we later (when we know the actual insn count)
+         * can update the argument with the actual insn count.
+         */
+        tcg_gen_sub_i32(count, count, tcg_constant_i32(0));
         icount_start_insn = tcg_last_op();
-
-        tcg_gen_sub_i32(count, count, imm);
-        tcg_temp_free_i32(imm);
     }
 
     tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, tcg_ctx->exitreq_label);
@@ -XXX,XX +XXX,XX @@ static inline void gen_tb_start(const TranslationBlock *tb)
 static inline void gen_tb_end(const TranslationBlock *tb, int num_insns)
 {
     if (tb_cflags(tb) & CF_USE_ICOUNT) {
-        /* Update the num_insn immediate parameter now that we know
-         * the actual insn count.  */
-        tcg_set_insn_param(icount_start_insn, 1, num_insns);
+        /*
+         * Update the num_insn immediate parameter now that we know
+         * the actual insn count.
+         */
+        tcg_set_insn_param(icount_start_insn, 2,
+                           tcgv_i32_arg(tcg_constant_i32(num_insns)));
     }
 
     gen_set_label(tcg_ctx->exitreq_label);
--
2.25.1

Avoid the use of the OptContext slots.  Find TempOptInfo once.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     uint64_t s_mask, z_mask, sign;
+    TempOptInfo *t1, *t2;
 
     if (fold_const2(ctx, op) ||
         fold_ix_to_i(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    s_mask = arg_info(op->args[1])->s_mask;
-    z_mask = arg_info(op->args[1])->z_mask;
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    s_mask = t1->s_mask;
+    z_mask = t1->z_mask;
 
-    if (arg_is_const(op->args[2])) {
-        int sh = arg_info(op->args[2])->val;
-
-        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
+    if (ti_is_const(t2)) {
+        int sh = ti_const_val(t2);
 
+        z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
         s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
 
-        return fold_masks(ctx, op);
+        return fold_masks_zs(ctx, op, z_mask, s_mask);
     }
 
     switch (op->opc) {
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
          * Arithmetic right shift will not reduce the number of
          * input sign repetitions.
          */
-        ctx->s_mask = s_mask;
-        break;
+        return fold_masks_s(ctx, op, s_mask);
     CASE_OP_32_64(shr):
         /*
          * If the sign bit is known zero, then logical right shift
-         * will not reduced the number of input sign repetitions.
+         * will not reduce the number of input sign repetitions.
          */
-        sign = (s_mask & -s_mask) >> 1;
+        sign = -s_mask;
         if (sign && !(z_mask & sign)) {
-            ctx->s_mask = s_mask;
+            return fold_masks_s(ctx, op, s_mask);
         }
         break;
     default:
         break;
     }
 
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
--
2.43.0
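Reusing do_constant_folding on the masks works because, for a constant shift, known-bit information moves exactly as the data does. Illustrated here with plain shifts standing in for the opcode dispatch:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      uint64_t z_mask = 0xff;   /* bits 0..7 may be set */
      int sh = 4;

      /* Shifting every possibly-set bit gives the result's z_mask. */
      printf("shl: %#llx\n", (unsigned long long)(z_mask << sh));  /* 0xff0 */
      printf("shr: %#llx\n", (unsigned long long)(z_mask >> sh));  /* 0xf */
      return 0;
  }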
New patch

Merge the two conditions, sign != 0 && !(z_mask & sign),
by testing ~z_mask & sign.  If sign == 0, the logical and
will produce false.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
-    uint64_t s_mask, z_mask, sign;
+    uint64_t s_mask, z_mask;
     TempOptInfo *t1, *t2;
 
     if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
          * If the sign bit is known zero, then logical right shift
          * will not reduce the number of input sign repetitions.
          */
-        sign = -s_mask;
-        if (sign && !(z_mask & sign)) {
+        if (~z_mask & -s_mask) {
             return fold_masks_s(ctx, op, s_mask);
         }
         break;
--
2.43.0
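The equivalence relied on above can be spot-checked. With a left-aligned s_mask (the representation this series uses, assumed here), -s_mask is either 0 or the single lowest bit of the sign-repetition region, so the merged test matches the old two-part condition:

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
      uint64_t s_mask = ~0ull << 8;   /* bits 63..8 repeat the sign */
      uint64_t z_mask = 0x7f;         /* bits 8 and up are known zero */

      uint64_t sign = -s_mask;        /* lowest bit of the region: 1 << 8 */
      int old_cond = sign != 0 && (z_mask & sign) == 0;
      int new_cond = (~z_mask & -s_mask) != 0;

      assert(old_cond == new_cond);   /* both true for these masks */
      return 0;
  }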
From: Philippe Mathieu-Daudé <f4bug@amsat.org>

When decodetree.py was added in commit 568ae7efae7, QEMU was
using Python 2, which happily reads UTF-8 files in text mode.
Python 3 requires either a UTF-8 locale or an explicit encoding
passed to open().  Now that Python 3 is required, use an explicit
UTF-8 encoding when opening decodetree source files.

To avoid further problems with the user's locale, also use an
explicit UTF-8 encoding for the generated C files.

Declare both input and output as plain text by using the 't' mode.

This fixes:

  $ /usr/bin/python3 scripts/decodetree.py test.decode
  Traceback (most recent call last):
    File "scripts/decodetree.py", line 1397, in <module>
      main()
    File "scripts/decodetree.py", line 1308, in main
      parse_file(f, toppat)
    File "scripts/decodetree.py", line 994, in parse_file
      for line in f:
    File "/usr/lib/python3.6/encodings/ascii.py", line 26, in decode
      return codecs.ascii_decode(input, self.errors)[0]
  UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 80:
  ordinal not in range(128)

Reported-by: Peter Maydell <peter.maydell@linaro.org>
Suggested-by: Yonggang Luo <luoyonggang@gmail.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Message-Id: <20210110000240.761122-1-f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 scripts/decodetree.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scripts/decodetree.py b/scripts/decodetree.py
index XXXXXXX..XXXXXXX 100644
--- a/scripts/decodetree.py
+++ b/scripts/decodetree.py
@@ -XXX,XX +XXX,XX @@
 # See the syntax and semantics in docs/devel/decodetree.rst.
 #
 
+import io
 import os
 import re
 import sys
@@ -XXX,XX +XXX,XX @@ def main():
 
     for filename in args:
         input_file = filename
-        f = open(filename, 'r')
+        f = open(filename, 'rt', encoding='utf-8')
         parse_file(f, toppat)
         f.close()
 
@@ -XXX,XX +XXX,XX @@ def main():
     prop_size(stree)
 
     if output_file:
-        output_fd = open(output_file, 'w')
+        output_fd = open(output_file, 'wt', encoding='utf-8')
     else:
-        output_fd = sys.stdout
+        output_fd = io.TextIOWrapper(sys.stdout.buffer,
+                                     encoding=sys.stdout.encoding,
+                                     errors="ignore")
 
     output_autogen()
     for n in sorted(arguments.keys()):
--
2.25.1

Duplicate fold_sub_vec into fold_sub instead of calling it,
now that fold_sub_vec always returns true.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_sub_vec(OptContext *ctx, TCGOp *op)
         fold_sub_to_neg(ctx, op)) {
         return true;
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) || fold_sub_vec(ctx, op)) {
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
+        fold_sub_to_neg(ctx, op)) {
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
                     ? INDEX_op_add_i32 : INDEX_op_add_i64);
         op->args[2] = arg_new_constant(ctx, -val);
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 static bool fold_sub2(OptContext *ctx, TCGOp *op)
--
2.43.0
New patch

Avoid the use of the OptContext slots.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2(OptContext *ctx, TCGOp *op)
 
 static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask = -1, s_mask = 0;
+
     /* We can't do any folding with a load, but we can record bits. */
     switch (op->opc) {
     CASE_OP_32_64(ld8s):
-        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        s_mask = INT8_MIN;
         break;
     CASE_OP_32_64(ld8u):
-        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        z_mask = MAKE_64BIT_MASK(0, 8);
         break;
     CASE_OP_32_64(ld16s):
-        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
+        s_mask = INT16_MIN;
         break;
     CASE_OP_32_64(ld16u):
-        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        z_mask = MAKE_64BIT_MASK(0, 16);
         break;
     case INDEX_op_ld32s_i64:
-        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
+        s_mask = INT32_MIN;
         break;
     case INDEX_op_ld32u_i64:
-        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        z_mask = MAKE_64BIT_MASK(0, 32);
         break;
     default:
         g_assert_not_reached();
     }
-    return false;
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op)
--
2.43.0
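The switch from MAKE_64BIT_MASK(8, 56) to INT8_MIN above is a representation change, not a behavior fix: the new form also counts the sign bit itself among the repetitions. Comparing the two encodings for an 8-bit signed load:

  #include <stdint.h>
  #include <stdio.h>

  #define MAKE_64BIT_MASK(shift, length) \
      (((~0ULL) >> (64 - (length))) << (shift))

  int main(void)
  {
      uint64_t old_form = MAKE_64BIT_MASK(8, 56);        /* bits 63..8 */
      uint64_t new_form = (uint64_t)(int64_t)INT8_MIN;   /* bits 63..7 */

      printf("old %#llx\nnew %#llx\n",
             (unsigned long long)old_form, (unsigned long long)new_form);
      return 0;
  }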
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld_memcopy(OptContext *ctx, TCGOp *op)
     TCGType type;
 
     if (op->args[1] != tcgv_ptr_arg(tcg_env)) {
-        return false;
+        return finish_folding(ctx, op);
     }
 
     type = ctx->type;
--
2.43.0
These are now completely covered by mov from a
TYPE_CONST temporary.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-opc.h        |  3 ---
 tcg/optimize.c               |  4 ----
 tcg/tcg-op-vec.c             |  1 -
 tcg/tcg.c                    | 18 +-----------------
 tcg/aarch64/tcg-target.c.inc |  3 ---
 tcg/arm/tcg-target.c.inc     |  1 -
 tcg/i386/tcg-target.c.inc    |  3 ---
 tcg/mips/tcg-target.c.inc    |  2 --
 tcg/ppc/tcg-target.c.inc     |  3 ---
 tcg/riscv/tcg-target.c.inc   |  2 --
 tcg/s390/tcg-target.c.inc    |  2 --
 tcg/sparc/tcg-target.c.inc   |  2 --
 tcg/tci/tcg-target.c.inc     |  2 --
 13 files changed, 1 insertion(+), 45 deletions(-)

diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -XXX,XX +XXX,XX @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 DEF(mb, 0, 0, 1, 0)
 
 DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
-DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
 DEF(setcond_i32, 1, 2, 1, 0)
 DEF(movcond_i32, 1, 4, 1, IMPL(TCG_TARGET_HAS_movcond_i32))
 /* load/store */
@@ -XXX,XX +XXX,XX @@ DEF(ctz_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_ctz_i32))
 DEF(ctpop_i32, 1, 1, 0, IMPL(TCG_TARGET_HAS_ctpop_i32))
 
 DEF(mov_i64, 1, 1, 0, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
-DEF(movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
 DEF(setcond_i64, 1, 2, 1, IMPL64)
 DEF(movcond_i64, 1, 4, 1, IMPL64 | IMPL(TCG_TARGET_HAS_movcond_i64))
 /* load/store */
@@ -XXX,XX +XXX,XX @@ DEF(qemu_st8_i32, 0, TLADDR_ARGS + 1, 1,
 #define IMPLVEC  TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec)
 
 DEF(mov_vec, 1, 1, 0, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
-DEF(dupi_vec, 1, 0, 1, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
 
 DEF(dup_vec, 1, 1, 0, IMPLVEC)
 DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32))
diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(mov):
             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
             break;
-        CASE_OP_32_64(movi):
-        case INDEX_op_dupi_vec:
-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], op->args[1]);
-            break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@ bool tcg_can_emit_vecop_list(const TCGOpcode *list,
     case INDEX_op_xor_vec:
     case INDEX_op_mov_vec:
     case INDEX_op_dup_vec:
-    case INDEX_op_dupi_vec:
     case INDEX_op_dup2_vec:
     case INDEX_op_ld_vec:
     case INDEX_op_st_vec:
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
         return TCG_TARGET_HAS_goto_ptr;
 
     case INDEX_op_mov_i32:
-    case INDEX_op_movi_i32:
     case INDEX_op_setcond_i32:
     case INDEX_op_brcond_i32:
     case INDEX_op_ld8u_i32:
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
         return TCG_TARGET_REG_BITS == 32;
 
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i64:
    case INDEX_op_setcond_i64:
     case INDEX_op_brcond_i64:
     case INDEX_op_ld8u_i64:
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
 
     case INDEX_op_mov_vec:
     case INDEX_op_dup_vec:
-    case INDEX_op_dupi_vec:
     case INDEX_op_dupm_vec:
     case INDEX_op_ld_vec:
     case INDEX_op_st_vec:
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs)
 }
 
 /*
- * Specialized code generation for INDEX_op_movi_*.
+ * Specialized code generation for INDEX_op_mov_* with a constant.
  */
 static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
                                   tcg_target_ulong val, TCGLifeData arg_life,
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
     }
 }
 
-static void tcg_reg_alloc_movi(TCGContext *s, const TCGOp *op)
-{
-    TCGTemp *ots = arg_temp(op->args[0]);
-    tcg_target_ulong val = op->args[1];
-
-    tcg_reg_alloc_do_movi(s, ots, val, op->life, op->output_pref[0]);
-}
-
 /*
  * Specialized code generation for INDEX_op_mov_*.
  */
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         case INDEX_op_mov_vec:
             tcg_reg_alloc_mov(s, op);
             break;
-        case INDEX_op_movi_i32:
-        case INDEX_op_movi_i64:
-        case INDEX_op_dupi_vec:
-            tcg_reg_alloc_movi(s, op);
-            break;
         case INDEX_op_dup_vec:
             tcg_reg_alloc_dup(s, op);
             break;
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov. */
-    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi. */
     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec. */
     default:
         g_assert_not_reached();
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         tcg_abort();
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         tcg_abort();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov. */
-    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi. */
     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec. */
     default:
         g_assert_not_reached();
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         tcg_abort();
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         tcg_abort();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         return;
 
     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov. */
-    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi. */
     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec. */
     default:
         g_assert_not_reached();
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         g_assert_not_reached();
diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390/tcg-target.c.inc
+++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         tcg_abort();
diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         tcg_abort();
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov. */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call. */
     default:
         tcg_abort();
--
2.25.1

Avoid the use of the OptContext slots. Find TempOptInfo once.
Remove fold_masks as the function becomes unused.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_masks_s(OptContext *ctx, TCGOp *op, uint64_t s_mask)
     return fold_masks_zs(ctx, op, -1, s_mask);
 }
 
-static bool fold_masks(OptContext *ctx, TCGOp *op)
-{
-    return fold_masks_zs(ctx, op, ctx->z_mask, ctx->s_mask);
-}
-
 /*
  * An "affected" mask bit is 0 if and only if the result is identical
  * to the first input.  Thus if the entire mask is 0, the operation
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_st_memcopy(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, s_mask;
+    TempOptInfo *t1, *t2;
+
     if (fold_const2_commutative(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
         return true;
     }
 
-    ctx->z_mask = arg_info(op->args[1])->z_mask
-                | arg_info(op->args[2])->z_mask;
-    ctx->s_mask = arg_info(op->args[1])->s_mask
-                & arg_info(op->args[2])->s_mask;
-    return fold_masks(ctx, op);
+    t1 = arg_info(op->args[1]);
+    t2 = arg_info(op->args[2]);
+    z_mask = t1->z_mask | t2->z_mask;
+    s_mask = t1->s_mask & t2->s_mask;
+    return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
 static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
--
2.43.0
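As an aside on the fold_xor hunk above, the mask algebra it preserves can be summarized in a short illustrative comment (mine, not patch code):

    /*
     * For x = a ^ b, a result bit can be nonzero only where either
     * input can be nonzero, and it repeats the sign bit only where
     * both inputs repeat theirs:
     *
     *     z_mask(x) = z_mask(a) | z_mask(b);
     *     s_mask(x) = s_mask(a) & s_mask(b);
     */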
New patch

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
             return fold_orc(ctx, op);
         }
     }
-    return false;
+    return finish_folding(ctx, op);
 }
 
 /* Propagate constants and copies, fold constant expressions. */
--
2.43.0
New patch

All non-default cases now finish folding within each function.
Do the same with the default case and assert it is done after.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             done = true;
             break;
         default:
+            done = finish_folding(&ctx, op);
             break;
         }
-
-        if (!done) {
-            finish_folding(&ctx, op);
-        }
+        tcg_debug_assert(done);
     }
 }
--
2.43.0
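The resulting shape of the dispatch loop, reduced to an illustrative outline (paraphrased, not the literal code):

    bool done = false;
    switch (opc) {
    /* ... every fold_*() helper returns true only after recording
     * the output's masks via finish_folding() or fold_masks_*() ... */
    default:
        done = finish_folding(&ctx, op);
        break;
    }
    tcg_debug_assert(done);  /* trips if a future helper forgets */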
New patch

All mask setting is now done with parameters via fold_masks_*.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     QSIMPLEQ_HEAD(, MemCopyInfo) mem_free;
 
     /* In flight values from optimization. */
-    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
-    uint64_t s_mask;  /* mask bit is 1 if value bit matches msb */
     TCGType type;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool finish_folding(OptContext *ctx, TCGOp *op)
     for (i = 0; i < nb_oargs; i++) {
         TCGTemp *ts = arg_temp(op->args[i]);
         reset_ts(ctx, ts);
-        /*
-         * Save the corresponding known-zero/sign bits mask for the
-         * first output argument (only one supported so far).
-         */
-        if (i == 0) {
-            ts_info(ts)->z_mask = ctx->z_mask;
-        }
     }
     return true;
 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         ctx.type = TCG_TYPE_I32;
     }
 
-    /* Assume all bits affected, no bits known zero, no sign reps. */
-    ctx.z_mask = -1;
-    ctx.s_mask = 0;
-
     /*
      * Process each opcode.
      * Sorted alphabetically by opcode as much as possible.
--
2.43.0
The temp_fixed, temp_global, temp_local bits are all related.
Combine them into a single enumeration.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h |  20 +++++---
 tcg/optimize.c    |   8 +--
 tcg/tcg.c         | 126 ++++++++++++++++++++++++++++------------------
 3 files changed, 92 insertions(+), 62 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef enum TCGTempVal {
     TEMP_VAL_CONST,
 } TCGTempVal;
 
+typedef enum TCGTempKind {
+    /* Temp is dead at the end of all basic blocks. */
+    TEMP_NORMAL,
+    /* Temp is saved across basic blocks but dead at the end of TBs. */
+    TEMP_LOCAL,
+    /* Temp is saved across both basic blocks and translation blocks. */
+    TEMP_GLOBAL,
+    /* Temp is in a fixed register. */
+    TEMP_FIXED,
+} TCGTempKind;
+
 typedef struct TCGTemp {
     TCGReg reg:8;
     TCGTempVal val_type:8;
     TCGType base_type:8;
     TCGType type:8;
-    unsigned int fixed_reg:1;
+    TCGTempKind kind:3;
     unsigned int indirect_reg:1;
     unsigned int indirect_base:1;
     unsigned int mem_coherent:1;
     unsigned int mem_allocated:1;
-    /* If true, the temp is saved across both basic blocks and
-       translation blocks.  */
-    unsigned int temp_global:1;
-    /* If true, the temp is saved across basic blocks but dead
-       at the end of translation blocks.  If false, the temp is
-       dead at the end of basic blocks.  */
-    unsigned int temp_local:1;
     unsigned int temp_allocated:1;
 
     tcg_target_long val;
diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
     TCGTemp *i;
 
     /* If this is already a global, we can't do better. */
-    if (ts->temp_global) {
+    if (ts->kind >= TEMP_GLOBAL) {
         return ts;
     }
 
     /* Search for a global first. */
     for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-        if (i->temp_global) {
+        if (i->kind >= TEMP_GLOBAL) {
             return i;
         }
     }
 
     /* If it is a temp, search for a temp local. */
-    if (!ts->temp_local) {
+    if (ts->kind == TEMP_NORMAL) {
         for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-            if (ts->temp_local) {
+            if (i->kind >= TEMP_LOCAL) {
                 return i;
             }
         }
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static inline TCGTemp *tcg_global_alloc(TCGContext *s)
     tcg_debug_assert(s->nb_globals == s->nb_temps);
     s->nb_globals++;
     ts = tcg_temp_alloc(s);
-    ts->temp_global = 1;
+    ts->kind = TEMP_GLOBAL;
 
     return ts;
 }
@@ -XXX,XX +XXX,XX @@ static TCGTemp *tcg_global_reg_new_internal(TCGContext *s, TCGType type,
     ts = tcg_global_alloc(s);
     ts->base_type = type;
     ts->type = type;
-    ts->fixed_reg = 1;
+    ts->kind = TEMP_FIXED;
     ts->reg = reg;
     ts->name = name;
     tcg_regset_set_reg(s->reserved_regs, reg);
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
     bigendian = 1;
 #endif
 
-    if (!base_ts->fixed_reg) {
+    if (base_ts->kind != TEMP_FIXED) {
         /* We do not support double-indirect registers.  */
         tcg_debug_assert(!base_ts->indirect_reg);
         base_ts->indirect_base = 1;
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
 TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
 {
     TCGContext *s = tcg_ctx;
+    TCGTempKind kind = temp_local ? TEMP_LOCAL : TEMP_NORMAL;
     TCGTemp *ts;
     int idx, k;
 
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
         ts = &s->temps[idx];
         ts->temp_allocated = 1;
         tcg_debug_assert(ts->base_type == type);
-        tcg_debug_assert(ts->temp_local == temp_local);
+        tcg_debug_assert(ts->kind == kind);
     } else {
         ts = tcg_temp_alloc(s);
         if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
             ts->base_type = type;
             ts->type = TCG_TYPE_I32;
             ts->temp_allocated = 1;
-            ts->temp_local = temp_local;
+            ts->kind = kind;
 
             tcg_debug_assert(ts2 == ts + 1);
             ts2->base_type = TCG_TYPE_I64;
             ts2->type = TCG_TYPE_I32;
             ts2->temp_allocated = 1;
-            ts2->temp_local = temp_local;
+            ts2->kind = kind;
         } else {
             ts->base_type = type;
             ts->type = type;
             ts->temp_allocated = 1;
-            ts->temp_local = temp_local;
+            ts->kind = kind;
         }
     }
 
@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
     }
 #endif
 
-    tcg_debug_assert(ts->temp_global == 0);
+    tcg_debug_assert(ts->kind < TEMP_GLOBAL);
     tcg_debug_assert(ts->temp_allocated != 0);
     ts->temp_allocated = 0;
 
     idx = temp_idx(ts);
-    k = ts->base_type + (ts->temp_local ? TCG_TYPE_COUNT : 0);
+    k = ts->base_type + (ts->kind == TEMP_NORMAL ? 0 : TCG_TYPE_COUNT);
     set_bit(idx, s->free_temps[k].l);
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
 static void tcg_reg_alloc_start(TCGContext *s)
 {
     int i, n;
-    TCGTemp *ts;
 
-    for (i = 0, n = s->nb_globals; i < n; i++) {
-        ts = &s->temps[i];
-        ts->val_type = (ts->fixed_reg ? TEMP_VAL_REG : TEMP_VAL_MEM);
-    }
-    for (n = s->nb_temps; i < n; i++) {
-        ts = &s->temps[i];
-        ts->val_type = (ts->temp_local ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
-        ts->mem_allocated = 0;
-        ts->fixed_reg = 0;
+    for (i = 0, n = s->nb_temps; i < n; i++) {
+        TCGTemp *ts = &s->temps[i];
+        TCGTempVal val = TEMP_VAL_MEM;
+
+        switch (ts->kind) {
+        case TEMP_FIXED:
+            val = TEMP_VAL_REG;
+            break;
+        case TEMP_GLOBAL:
+            break;
+        case TEMP_NORMAL:
+            val = TEMP_VAL_DEAD;
+            /* fall through */
+        case TEMP_LOCAL:
+            ts->mem_allocated = 0;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        ts->val_type = val;
     }
 
     memset(s->reg_to_temp, 0, sizeof(s->reg_to_temp));
@@ -XXX,XX +XXX,XX @@ static char *tcg_get_arg_str_ptr(TCGContext *s, char *buf, int buf_size,
 {
     int idx = temp_idx(ts);
 
-    if (ts->temp_global) {
+    switch (ts->kind) {
+    case TEMP_FIXED:
+    case TEMP_GLOBAL:
         pstrcpy(buf, buf_size, ts->name);
-    } else if (ts->temp_local) {
+        break;
+    case TEMP_LOCAL:
         snprintf(buf, buf_size, "loc%d", idx - s->nb_globals);
-    } else {
+        break;
+    case TEMP_NORMAL:
         snprintf(buf, buf_size, "tmp%d", idx - s->nb_globals);
+        break;
     }
     return buf;
 }
@@ -XXX,XX +XXX,XX @@ static void la_bb_end(TCGContext *s, int ng, int nt)
 {
     int i;
 
-    for (i = 0; i < ng; ++i) {
-        s->temps[i].state = TS_DEAD | TS_MEM;
-        la_reset_pref(&s->temps[i]);
-    }
-    for (i = ng; i < nt; ++i) {
-        s->temps[i].state = (s->temps[i].temp_local
-                             ? TS_DEAD | TS_MEM
-                             : TS_DEAD);
-        la_reset_pref(&s->temps[i]);
+    for (i = 0; i < nt; ++i) {
+        TCGTemp *ts = &s->temps[i];
+        int state;
+
+        switch (ts->kind) {
+        case TEMP_FIXED:
+        case TEMP_GLOBAL:
+        case TEMP_LOCAL:
+            state = TS_DEAD | TS_MEM;
+            break;
+        case TEMP_NORMAL:
+            state = TS_DEAD;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        ts->state = state;
+        la_reset_pref(ts);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void la_bb_sync(TCGContext *s, int ng, int nt)
     la_global_sync(s, ng);
 
     for (int i = ng; i < nt; ++i) {
-        if (s->temps[i].temp_local) {
+        if (s->temps[i].kind == TEMP_LOCAL) {
             int state = s->temps[i].state;
             s->temps[i].state = state | TS_MEM;
             if (state != TS_DEAD) {
@@ -XXX,XX +XXX,XX @@ static void check_regs(TCGContext *s)
     }
     for (k = 0; k < s->nb_temps; k++) {
         ts = &s->temps[k];
-        if (ts->val_type == TEMP_VAL_REG && !ts->fixed_reg
+        if (ts->val_type == TEMP_VAL_REG
+            && ts->kind != TEMP_FIXED
             && s->reg_to_temp[ts->reg] != ts) {
             printf("Inconsistency for temp %s:\n",
                    tcg_get_arg_str_ptr(s, buf, sizeof(buf), ts));
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
    mark it free; otherwise mark it dead.  */
 static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
 {
-    if (ts->fixed_reg) {
+    if (ts->kind == TEMP_FIXED) {
         return;
     }
     if (ts->val_type == TEMP_VAL_REG) {
         s->reg_to_temp[ts->reg] = NULL;
     }
     ts->val_type = (free_or_dead < 0
-                    || ts->temp_local
-                    || ts->temp_global
+                    || ts->kind != TEMP_NORMAL
                     ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
 }
 
@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
 static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
                       TCGRegSet preferred_regs, int free_or_dead)
 {
-    if (ts->fixed_reg) {
+    if (ts->kind == TEMP_FIXED) {
         return;
     }
     if (!ts->mem_coherent) {
@@ -XXX,XX +XXX,XX @@ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs)
 {
     /* The liveness analysis already ensures that globals are back
        in memory. Keep an tcg_debug_assert for safety. */
-    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg);
+    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM
+                     || ts->kind == TEMP_FIXED);
 }
 
 /* save globals to their canonical location and assume they can be
@@ -XXX,XX +XXX,XX @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs)
     for (i = 0, n = s->nb_globals; i < n; i++) {
         TCGTemp *ts = &s->temps[i];
         tcg_debug_assert(ts->val_type != TEMP_VAL_REG
-                         || ts->fixed_reg
+                         || ts->kind == TEMP_FIXED
                          || ts->mem_coherent);
     }
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
 
     for (i = s->nb_globals; i < s->nb_temps; i++) {
         TCGTemp *ts = &s->temps[i];
-        if (ts->temp_local) {
+        if (ts->kind == TEMP_LOCAL) {
             temp_save(s, ts, allocated_regs);
         } else {
             /* The liveness analysis already ensures that temps are dead.
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs)
      * The liveness analysis already ensures that temps are dead.
      * Keep tcg_debug_asserts for safety.
      */
-    if (ts->temp_local) {
+    if (ts->kind == TEMP_LOCAL) {
         tcg_debug_assert(ts->val_type != TEMP_VAL_REG || ts->mem_coherent);
     } else {
         tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD);
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
                                   TCGRegSet preferred_regs)
 {
     /* ENV should not be modified.  */
-    tcg_debug_assert(!ots->fixed_reg);
+    tcg_debug_assert(ots->kind != TEMP_FIXED);
 
     /* The movi is not explicitly generated here.  */
     if (ots->val_type == TEMP_VAL_REG) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
     ts = arg_temp(op->args[1]);
 
     /* ENV should not be modified.  */
-    tcg_debug_assert(!ots->fixed_reg);
+    tcg_debug_assert(ots->kind != TEMP_FIXED);
 
     /* Note that otype != itype for no-op truncation.  */
     otype = ots->type;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
         }
         temp_dead(s, ots);
     } else {
-        if (IS_DEAD_ARG(1) && !ts->fixed_reg) {
+        if (IS_DEAD_ARG(1) && ts->kind != TEMP_FIXED) {
             /* the mov can be suppressed */
             if (ots->val_type == TEMP_VAL_REG) {
                 s->reg_to_temp[ots->reg] = NULL;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
              * Store the source register into the destination slot
              * and leave the destination temp as TEMP_VAL_MEM.
              */
-            assert(!ots->fixed_reg);
+            assert(ots->kind != TEMP_FIXED);
             if (!ts->mem_allocated) {
                 temp_allocate_frame(s, ots);
             }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
     its = arg_temp(op->args[1]);
 
     /* ENV should not be modified.  */
-    tcg_debug_assert(!ots->fixed_reg);
+    tcg_debug_assert(ots->kind != TEMP_FIXED);
 
     itype = its->type;
     vece = TCGOP_VECE(op);
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         i_preferred_regs = o_preferred_regs = 0;
         if (arg_ct->ialias) {
             o_preferred_regs = op->output_pref[arg_ct->alias_index];
-            if (ts->fixed_reg) {
+            if (ts->kind == TEMP_FIXED) {
                 /* if fixed register, we must allocate a new register
                    if the alias is not the same register */
                 if (arg != op->args[arg_ct->alias_index]) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         ts = arg_temp(arg);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(!ts->fixed_reg);
+        tcg_debug_assert(ts->kind != TEMP_FIXED);
 
         if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
             reg = new_args[arg_ct->alias_index];
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         ts = arg_temp(op->args[i]);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(!ts->fixed_reg);
+        tcg_debug_assert(ts->kind != TEMP_FIXED);
 
         if (NEED_SYNC_ARG(i)) {
             temp_sync(s, ts, o_allocated_regs, 0, IS_DEAD_ARG(i));
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
         ts = arg_temp(arg);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(!ts->fixed_reg);
+        tcg_debug_assert(ts->kind != TEMP_FIXED);
 
         reg = tcg_target_call_oarg_regs[i];
         tcg_debug_assert(s->reg_to_temp[reg] == NULL);
--
2.25.1

All instances of s_mask have been converted to the new
representation.  We can now re-enable usage.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
         g_assert_not_reached();
     }
 
-    if (0 && !type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+    if (!type_change && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
     s_mask = s_mask_old >> pos;
     s_mask |= -1ull << (len - 1);
 
-    if (0 && pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
+    if (pos == 0 && fold_affected_mask(ctx, op, s_mask & ~s_mask_old)) {
         return true;
     }
--
2.43.0
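One property of TCGTempKind worth calling out (implicit in the hunks above): the enumerators are deliberately ordered so that lifetime tests collapse into single comparisons. Illustrative examples, taken from the patterns this patch introduces:

    /* TEMP_NORMAL < TEMP_LOCAL < TEMP_GLOBAL < TEMP_FIXED */
    if (ts->kind >= TEMP_GLOBAL) {
        /* replaces the old ts->temp_global test; globals and
         * fixed registers both persist across TBs */
    }
    if (ts->kind != TEMP_NORMAL) {
        /* replaces ts->temp_local || ts->temp_global */
    }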
Do not allocate a large block for indexing.  Instead, allocate
for each temporary as they are seen.

In general, this will use less memory, if we consider that most
TBs do not touch every target register.  This also allows us to
allocate TempOptInfo for new temps created during optimization.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 60 ++++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 26 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(TempOptInfo *infos,
-                         TCGTempSet *temps_used, TCGTemp *ts)
+static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
-    if (!test_bit(idx, temps_used->l)) {
-        TempOptInfo *ti = &infos[idx];
+    TempOptInfo *ti;
 
+    if (test_bit(idx, temps_used->l)) {
+        return;
+    }
+    set_bit(idx, temps_used->l);
+
+    ti = ts->state_ptr;
+    if (ti == NULL) {
+        ti = tcg_malloc(sizeof(TempOptInfo));
         ts->state_ptr = ti;
-        ti->next_copy = ts;
-        ti->prev_copy = ts;
-        if (ts->kind == TEMP_CONST) {
-            ti->is_const = true;
-            ti->val = ti->mask = ts->val;
-            if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-                /* High bits of a 32-bit quantity are garbage.  */
-                ti->mask |= ~0xffffffffull;
-            }
-        } else {
-            ti->is_const = false;
-            ti->mask = -1;
+    }
+
+    ti->next_copy = ts;
+    ti->prev_copy = ts;
+    if (ts->kind == TEMP_CONST) {
+        ti->is_const = true;
+        ti->val = ts->val;
+        ti->mask = ts->val;
+        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
+            /* High bits of a 32-bit quantity are garbage.  */
+            ti->mask |= ~0xffffffffull;
         }
-        set_bit(idx, temps_used->l);
+    } else {
+        ti->is_const = false;
+        ti->mask = -1;
     }
 }
 
-static void init_arg_info(TempOptInfo *infos,
-                          TCGTempSet *temps_used, TCGArg arg)
+static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
 {
-    init_ts_info(infos, temps_used, arg_temp(arg));
+    init_ts_info(temps_used, arg_temp(arg));
 }
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
-    int nb_temps, nb_globals;
+    int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    TempOptInfo *infos;
     TCGTempSet temps_used;
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
+
     bitmap_zero(temps_used.l, nb_temps);
-    infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);
+    for (i = 0; i < nb_temps; ++i) {
+        s->temps[i].state_ptr = NULL;
+    }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         uint64_t mask, partmask, affected, tmp;
-        int nb_oargs, nb_iargs, i;
+        int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
                 TCGTemp *ts = arg_temp(op->args[i]);
                 if (ts) {
-                    init_ts_info(infos, &temps_used, ts);
+                    init_ts_info(&temps_used, ts);
                 }
             }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(infos, &temps_used, op->args[i]);
+                init_arg_info(&temps_used, op->args[i]);
             }
         }
 
--
2.25.1

The big comment just above says functions should be sorted.
Add forward declarations as needed.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 114 +++++++++++++++++++++++++------------------------
 1 file changed, 59 insertions(+), 55 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
  * 3) those that produce information about the result value.
  */
 
+static bool fold_or(OptContext *ctx, TCGOp *op);
+static bool fold_orc(OptContext *ctx, TCGOp *op);
+static bool fold_xor(OptContext *ctx, TCGOp *op);
+
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2_commutative(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
+static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
+{
+    /* If true and false values are the same, eliminate the cmp. */
+    if (args_are_copies(op->args[2], op->args[3])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
+    }
+
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+        uint64_t tv = arg_info(op->args[2])->val;
+        uint64_t fv = arg_info(op->args[3])->val;
+
+        if (tv == -1 && fv == 0) {
+            return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+        }
+        if (tv == 0 && fv == -1) {
+            if (TCG_TARGET_HAS_not_vec) {
+                op->opc = INDEX_op_not_vec;
+                return fold_not(ctx, op);
+            } else {
+                op->opc = INDEX_op_xor_vec;
+                op->args[2] = arg_new_constant(ctx, -1);
+                return fold_xor(ctx, op);
+            }
+        }
+    }
+    if (arg_is_const(op->args[2])) {
+        uint64_t tv = arg_info(op->args[2])->val;
+        if (tv == -1) {
+            op->opc = INDEX_op_or_vec;
+            op->args[2] = op->args[3];
+            return fold_or(ctx, op);
+        }
+        if (tv == 0 && TCG_TARGET_HAS_andc_vec) {
+            op->opc = INDEX_op_andc_vec;
+            op->args[2] = op->args[1];
+            op->args[1] = op->args[3];
+            return fold_andc(ctx, op);
+        }
+    }
+    if (arg_is_const(op->args[3])) {
+        uint64_t fv = arg_info(op->args[3])->val;
+        if (fv == 0) {
+            op->opc = INDEX_op_and_vec;
+            return fold_and(ctx, op);
+        }
+        if (fv == -1 && TCG_TARGET_HAS_orc_vec) {
+            op->opc = INDEX_op_orc_vec;
+            op->args[2] = op->args[1];
+            op->args[1] = op->args[3];
+            return fold_orc(ctx, op);
+        }
+    }
+    return finish_folding(ctx, op);
+}
+
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     int i = do_constant_folding_cond1(ctx, op, NO_DEST, &op->args[0],
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
     return fold_masks_zs(ctx, op, z_mask, s_mask);
 }
 
-static bool fold_bitsel_vec(OptContext *ctx, TCGOp *op)
-{
-    /* If true and false values are the same, eliminate the cmp. */
-    if (args_are_copies(op->args[2], op->args[3])) {
-        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
-    }
-
-    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-        uint64_t tv = arg_info(op->args[2])->val;
-        uint64_t fv = arg_info(op->args[3])->val;
-
-        if (tv == -1 && fv == 0) {
-            return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-        }
-        if (tv == 0 && fv == -1) {
-            if (TCG_TARGET_HAS_not_vec) {
-                op->opc = INDEX_op_not_vec;
-                return fold_not(ctx, op);
-            } else {
-                op->opc = INDEX_op_xor_vec;
-                op->args[2] = arg_new_constant(ctx, -1);
-                return fold_xor(ctx, op);
-            }
-        }
-    }
-    if (arg_is_const(op->args[2])) {
-        uint64_t tv = arg_info(op->args[2])->val;
-        if (tv == -1) {
-            op->opc = INDEX_op_or_vec;
-            op->args[2] = op->args[3];
-            return fold_or(ctx, op);
-        }
-        if (tv == 0 && TCG_TARGET_HAS_andc_vec) {
-            op->opc = INDEX_op_andc_vec;
-            op->args[2] = op->args[1];
-            op->args[1] = op->args[3];
-            return fold_andc(ctx, op);
-        }
-    }
-    if (arg_is_const(op->args[3])) {
-        uint64_t fv = arg_info(op->args[3])->val;
-        if (fv == 0) {
-            op->opc = INDEX_op_and_vec;
-            return fold_and(ctx, op);
-        }
-        if (fv == -1 && TCG_TARGET_HAS_orc_vec) {
-            op->opc = INDEX_op_orc_vec;
-            op->args[2] = op->args[1];
-            op->args[1] = op->args[3];
-            return fold_orc(ctx, op);
-        }
-    }
-    return finish_folding(ctx, op);
-}
-
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
--
2.43.0
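The memory argument in the first commit message above can be made concrete with a rough, hypothetical figure (mine, not from the patch): a TCGContext can hold several hundred temps while a typical TB initializes only a few dozen, so the up-front cost of

    infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);   /* every temp */

drops to one tcg_malloc(sizeof(TempOptInfo)) per temp actually seen; the state_ptr == NULL check also lets a temp created mid-optimization acquire its info on first use.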
There are several ways we can expand a vector dup of a 64-bit
element on a 32-bit host.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
     }
 }
 
+static bool tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
+{
+    const TCGLifeData arg_life = op->life;
+    TCGTemp *ots, *itsl, *itsh;
+    TCGType vtype = TCGOP_VECL(op) + TCG_TYPE_V64;
+
+    /* This opcode is only valid for 32-bit hosts, for 64-bit elements. */
+    tcg_debug_assert(TCG_TARGET_REG_BITS == 32);
+    tcg_debug_assert(TCGOP_VECE(op) == MO_64);
+
+    ots = arg_temp(op->args[0]);
+    itsl = arg_temp(op->args[1]);
+    itsh = arg_temp(op->args[2]);
+
+    /* ENV should not be modified. */
+    tcg_debug_assert(!temp_readonly(ots));
+
+    /* Allocate the output register now. */
+    if (ots->val_type != TEMP_VAL_REG) {
+        TCGRegSet allocated_regs = s->reserved_regs;
+        TCGRegSet dup_out_regs =
+            tcg_op_defs[INDEX_op_dup_vec].args_ct[0].regs;
+
+        /* Make sure to not spill the input registers. */
+        if (!IS_DEAD_ARG(1) && itsl->val_type == TEMP_VAL_REG) {
+            tcg_regset_set_reg(allocated_regs, itsl->reg);
+        }
+        if (!IS_DEAD_ARG(2) && itsh->val_type == TEMP_VAL_REG) {
+            tcg_regset_set_reg(allocated_regs, itsh->reg);
+        }
+
+        ots->reg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
+                                 op->output_pref[0], ots->indirect_base);
+        ots->val_type = TEMP_VAL_REG;
+        ots->mem_coherent = 0;
+        s->reg_to_temp[ots->reg] = ots;
+    }
+
+    /* Promote dup2 of immediates to dupi_vec. */
+    if (itsl->val_type == TEMP_VAL_CONST && itsh->val_type == TEMP_VAL_CONST) {
+        uint64_t val = deposit64(itsl->val, 32, 32, itsh->val);
+        MemOp vece = MO_64;
+
+        if (val == dup_const(MO_8, val)) {
+            vece = MO_8;
+        } else if (val == dup_const(MO_16, val)) {
+            vece = MO_16;
+        } else if (val == dup_const(MO_32, val)) {
+            vece = MO_32;
+        }
+
+        tcg_out_dupi_vec(s, vtype, vece, ots->reg, val);
+        goto done;
+    }
+
+    /* If the two inputs form one 64-bit value, try dupm_vec. */
+    if (itsl + 1 == itsh && itsl->base_type == TCG_TYPE_I64) {
+        if (!itsl->mem_coherent) {
+            temp_sync(s, itsl, s->reserved_regs, 0, 0);
+        }
+        if (!itsh->mem_coherent) {
+            temp_sync(s, itsh, s->reserved_regs, 0, 0);
+        }
+#ifdef HOST_WORDS_BIGENDIAN
+        TCGTemp *its = itsh;
+#else
+        TCGTemp *its = itsl;
+#endif
+        if (tcg_out_dupm_vec(s, vtype, MO_64, ots->reg,
+                             its->mem_base->reg, its->mem_offset)) {
+            goto done;
+        }
+    }
+
+    /* Fall back to generic expansion. */
+    return false;
+
+ done:
+    if (IS_DEAD_ARG(1)) {
+        temp_dead(s, itsl);
+    }
+    if (IS_DEAD_ARG(2)) {
+        temp_dead(s, itsh);
+    }
+    if (NEED_SYNC_ARG(0)) {
+        temp_sync(s, ots, s->reserved_regs, 0, IS_DEAD_ARG(0));
+    } else if (IS_DEAD_ARG(0)) {
+        temp_dead(s, ots);
+    }
+    return true;
+}
+
 #ifdef TCG_TARGET_STACK_GROWSUP
 #define STACK_DIR(x) (-(x))
 #else
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         case INDEX_op_call:
             tcg_reg_alloc_call(s, op);
             break;
+        case INDEX_op_dup2_vec:
+            if (tcg_reg_alloc_dup2(s, op)) {
+                break;
+            }
+            /* fall through */
         default:
             /* Sanity check that we've not introduced any unhandled opcodes. */
             tcg_debug_assert(tcg_op_supported(opc));
--
2.25.1

The big comment just above says functions should be sorted.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 60 +++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_cmp_vec(OptContext *ctx, TCGOp *op)
+{
+    /* Canonicalize the comparison to put immediate second. */
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[3] = tcg_swap_cond(op->args[3]);
+    }
+    return finish_folding(ctx, op);
+}
+
+static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
+{
+    /* If true and false values are the same, eliminate the cmp. */
+    if (args_are_copies(op->args[3], op->args[4])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[3]);
+    }
+
+    /* Canonicalize the comparison to put immediate second. */
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[5] = tcg_swap_cond(op->args[5]);
+    }
+    /*
+     * Canonicalize the "false" input reg to match the destination,
+     * so that the tcg backend can implement "move if true".
+     */
+    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
+        op->args[5] = tcg_invert_cond(op->args[5]);
+    }
+    return finish_folding(ctx, op);
+}
+
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask, s_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
 
-static bool fold_cmp_vec(OptContext *ctx, TCGOp *op)
-{
-    /* Canonicalize the comparison to put immediate second. */
-    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
-        op->args[3] = tcg_swap_cond(op->args[3]);
-    }
-    return finish_folding(ctx, op);
-}
-
-static bool fold_cmpsel_vec(OptContext *ctx, TCGOp *op)
-{
-    /* If true and false values are the same, eliminate the cmp. */
-    if (args_are_copies(op->args[3], op->args[4])) {
-        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[3]);
-    }
-
-    /* Canonicalize the comparison to put immediate second. */
-    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
-        op->args[5] = tcg_swap_cond(op->args[5]);
-    }
-    /*
-     * Canonicalize the "false" input reg to match the destination,
-     * so that the tcg backend can implement "move if true".
-     */
-    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-        op->args[5] = tcg_invert_cond(op->args[5]);
-    }
-    return finish_folding(ctx, op);
-}
-
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask, s_mask, s_mask_old;
--
2.43.0
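The vece-narrowing logic in tcg_reg_alloc_dup2 relies on dup_const() replicating the low element of its argument across 64 bits. A few illustrative values (my own examples, not from the patch) showing what the checks classify:

    /*
     *     0x4242424242424242 == dup_const(MO_8,  val)  ->  vece = MO_8
     *     0x00ff00ff00ff00ff == dup_const(MO_16, val)  ->  vece = MO_16
     *     0x1234567812345678 == dup_const(MO_32, val)  ->  vece = MO_32
     *     0x0123456789abcdef  (no replication)         ->  vece = MO_64
     */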
New patch

We currently have a flag, float_muladd_halve_result, to scale
the result by 2**-1.  Extend this to handle arbitrary scaling.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat.h   |  6 ++++
 fpu/softfloat.c           | 58 ++++++++++++++++++++++-----------------
 fpu/softfloat-parts.c.inc |  7 +++--
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -XXX,XX +XXX,XX @@ float16 float16_add(float16, float16, float_status *status);
 float16 float16_sub(float16, float16, float_status *status);
 float16 float16_mul(float16, float16, float_status *status);
 float16 float16_muladd(float16, float16, float16, int, float_status *status);
+float16 float16_muladd_scalbn(float16, float16, float16,
+                              int, int, float_status *status);
 float16 float16_div(float16, float16, float_status *status);
 float16 float16_scalbn(float16, int, float_status *status);
 float16 float16_min(float16, float16, float_status *status);
@@ -XXX,XX +XXX,XX @@ float32 float32_mul(float32, float32, float_status *status);
 float32 float32_div(float32, float32, float_status *status);
 float32 float32_rem(float32, float32, float_status *status);
 float32 float32_muladd(float32, float32, float32, int, float_status *status);
+float32 float32_muladd_scalbn(float32, float32, float32,
+                              int, int, float_status *status);
 float32 float32_sqrt(float32, float_status *status);
 float32 float32_exp2(float32, float_status *status);
 float32 float32_log2(float32, float_status *status);
@@ -XXX,XX +XXX,XX @@ float64 float64_mul(float64, float64, float_status *status);
 float64 float64_div(float64, float64, float_status *status);
 float64 float64_rem(float64, float64, float_status *status);
 float64 float64_muladd(float64, float64, float64, int, float_status *status);
+float64 float64_muladd_scalbn(float64, float64, float64,
+                              int, int, float_status *status);
 float64 float64_sqrt(float64, float_status *status);
 float64 float64_log2(float64, float_status *status);
 FloatRelation float64_compare(float64, float64, float_status *status);
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -XXX,XX +XXX,XX @@ static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
 #define parts_mul(A, B, S) \
     PARTS_GENERIC_64_128(mul, A)(A, B, S)
 
-static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
-                                    FloatParts64 *c, int flags,
-                                    float_status *s);
-static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
-                                      FloatParts128 *c, int flags,
-                                      float_status *s);
+static FloatParts64 *parts64_muladd_scalbn(FloatParts64 *a, FloatParts64 *b,
+                                           FloatParts64 *c, int scale,
+                                           int flags, float_status *s);
+static FloatParts128 *parts128_muladd_scalbn(FloatParts128 *a, FloatParts128 *b,
+                                             FloatParts128 *c, int scale,
+                                             int flags, float_status *s);
 
-#define parts_muladd(A, B, C, Z, S) \
-    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)
+#define parts_muladd_scalbn(A, B, C, Z, Y, S) \
+    PARTS_GENERIC_64_128(muladd_scalbn, A)(A, B, C, Z, Y, S)
 
 static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                  float_status *s);
@@ -XXX,XX +XXX,XX @@ floatx80_mul(floatx80 a, floatx80 b, float_status *status)
  * Fused multiply-add
  */
 
-float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
-                                    int flags, float_status *status)
+float16 QEMU_FLATTEN
+float16_muladd_scalbn(float16 a, float16 b, float16 c,
+                      int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
 
     float16_unpack_canonical(&pa, a, status);
     float16_unpack_canonical(&pb, b, status);
     float16_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
     return float16_round_pack_canonical(pr, status);
 }
 
-static float32 QEMU_SOFTFLOAT_ATTR
-soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
-                float_status *status)
+float16 float16_muladd(float16 a, float16 b, float16 c,
+                       int flags, float_status *status)
+{
+    return float16_muladd_scalbn(a, b, c, 0, flags, status);
+}
+
+float32 QEMU_SOFTFLOAT_ATTR
+float32_muladd_scalbn(float32 a, float32 b, float32 c,
+                      int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
 
     float32_unpack_canonical(&pa, a, status);
     float32_unpack_canonical(&pb, b, status);
     float32_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
     return float32_round_pack_canonical(pr, status);
 }
 
-static float64 QEMU_SOFTFLOAT_ATTR
-soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
-                float_status *status)
+float64 QEMU_SOFTFLOAT_ATTR
+float64_muladd_scalbn(float64 a, float64 b, float64 c,
+                      int scale, int flags, float_status *status)
 {
     FloatParts64 pa, pb, pc, *pr;
 
     float64_unpack_canonical(&pa, a, status);
     float64_unpack_canonical(&pb, b, status);
     float64_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, scale, flags, status);
 
     return float64_round_pack_canonical(pr, status);
 }
@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     return ur.s;
 
  soft:
-    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
+    return float32_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s);
 }
 
 float64 QEMU_FLATTEN
@@ -XXX,XX +XXX,XX @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
     return ur.s;
 
  soft:
-    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
+    return float64_muladd_scalbn(ua.s, ub.s, uc.s, 0, flags, s);
 }
 
 float64 float64r32_muladd(float64 a, float64 b, float64 c,
@@ -XXX,XX +XXX,XX @@ float64 float64r32_muladd(float64 a, float64 b, float64 c,
     float64_unpack_canonical(&pa, a, status);
     float64_unpack_canonical(&pb, b, status);
     float64_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
     return float64r32_round_pack_canonical(pr, status);
 }
@@ -XXX,XX +XXX,XX @@ bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
     bfloat16_unpack_canonical(&pa, a, status);
     bfloat16_unpack_canonical(&pb, b, status);
     bfloat16_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
     return bfloat16_round_pack_canonical(pr, status);
 }
@@ -XXX,XX +XXX,XX @@ float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
     float128_unpack_canonical(&pa, a, status);
     float128_unpack_canonical(&pb, b, status);
     float128_unpack_canonical(&pc, c, status);
-    pr = parts_muladd(&pa, &pb, &pc, flags, status);
+    pr = parts_muladd_scalbn(&pa, &pb, &pc, 0, flags, status);
 
     return float128_round_pack_canonical(pr, status);
 }
@@ -XXX,XX +XXX,XX @@ float32 float32_exp2(float32 a, float_status *status)
 
     float64_unpack_canonical(&rp, float64_one, status);
     for (i = 0 ; i < 15 ; i++) {
+
         float64_unpack_canonical(&tp, float32_exp2_coefficients[i], status);
-        rp = *parts_muladd(&tp, &xnp, &rp, 0, status);
+        rp = *parts_muladd_scalbn(&tp, &xnp, &rp, 0, 0, status);
         xnp = *parts_mul(&xnp, &xp, status);
     }
 
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
  * Requires A and C extracted into a double-sized structure to provide the
  * extra space for the widening multiply.
  */
-static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
-                                   FloatPartsN *c, int flags, float_status *s)
+static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
+                                          FloatPartsN *c, int scale,
+                                          int flags, float_status *s)
 {
     int ab_mask, abc_mask;
     FloatPartsW p_widen, c_widen;
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd)(FloatPartsN *a, FloatPartsN *b,
     a->exp = p_widen.exp;
 
 return_normal:
+    /* TODO: Replace all use of float_muladd_halve_result with scale. */
     if (flags & float_muladd_halve_result) {
         a->exp -= 1;
     }
+    a->exp += scale;
 finish_sign:
     if (flags & float_muladd_negate_result) {
         a->sign ^= 1;
--
2.43.0
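A sketch of the resulting call pattern, assuming only the declarations added above: the result is (a * b + c) * 2**scale with a single final rounding, so

    float64 r  = float64_muladd_scalbn(a, b, c,  0, 0, fpst); /* plain fused multiply-add */
    float64 rh = float64_muladd_scalbn(a, b, c, -1, 0, fpst); /* halved result, matching
                                                                 float_muladd_halve_result */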
Use the scalbn interface instead of float_muladd_halve_result.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/helper-a64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/helper-a64.c
+++ b/target/arm/tcg/helper-a64.c
@@ -XXX,XX +XXX,XX @@ uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, float_status *fpst)
         (float16_is_infinity(b) && float16_is_zero(a))) {
         return float16_one_point_five;
     }
-    return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
+    return float16_muladd_scalbn(a, b, float16_three, -1, 0, fpst);
 }

 float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst)
@@ -XXX,XX +XXX,XX @@ float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst)
         (float32_is_infinity(b) && float32_is_zero(a))) {
         return float32_one_point_five;
     }
-    return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
+    return float32_muladd_scalbn(a, b, float32_three, -1, 0, fpst);
 }

 float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst)
@@ -XXX,XX +XXX,XX @@ float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst)
         (float64_is_infinity(b) && float64_is_zero(a))) {
         return float64_one_point_five;
     }
-    return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
+    return float64_muladd_scalbn(a, b, float64_three, -1, 0, fpst);
 }

 /* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
--
2.43.0
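
The conversion is mechanical: a scale of -1 is exactly what
float_muladd_halve_result did, one halving folded in before the single
rounding.  As a sketch for the f32 helper, assuming "fpu/softfloat.h"
is in scope (the flag form is what a later patch in this series
removes):

    /* Both compute (a * b + 3.0) / 2 with one rounding. */
    float32 r_flag  = float32_muladd(a, b, float32_three,
                                     float_muladd_halve_result, fpst);
    float32 r_scale = float32_muladd_scalbn(a, b, float32_three,
                                            -1, 0, fpst);
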
Use the scalbn interface instead of float_muladd_halve_result.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/sparc/helper.h     |  4 +-
 target/sparc/fop_helper.c |  8 ++--
 target/sparc/translate.c  | 80 +++++++++++++++++++++++----------------
 3 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/target/sparc/helper.h b/target/sparc/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/helper.h
+++ b/target/sparc/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(faddd, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_3(fsubd, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_3(fmuld, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_3(fdivd, TCG_CALL_NO_WG, f64, env, f64, f64)
-DEF_HELPER_FLAGS_5(fmaddd, TCG_CALL_NO_WG, f64, env, f64, f64, f64, i32)
+DEF_HELPER_FLAGS_6(fmaddd, TCG_CALL_NO_WG, f64, env, f64, f64, f64, s32, i32)
 DEF_HELPER_FLAGS_3(fnaddd, TCG_CALL_NO_WG, f64, env, f64, f64)
 DEF_HELPER_FLAGS_3(fnmuld, TCG_CALL_NO_WG, f64, env, f64, f64)

@@ -XXX,XX +XXX,XX @@ DEF_HELPER_FLAGS_3(fadds, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fsubs, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fmuls, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fdivs, TCG_CALL_NO_WG, f32, env, f32, f32)
-DEF_HELPER_FLAGS_5(fmadds, TCG_CALL_NO_WG, f32, env, f32, f32, f32, i32)
+DEF_HELPER_FLAGS_6(fmadds, TCG_CALL_NO_WG, f32, env, f32, f32, f32, s32, i32)
 DEF_HELPER_FLAGS_3(fnadds, TCG_CALL_NO_WG, f32, env, f32, f32)
 DEF_HELPER_FLAGS_3(fnmuls, TCG_CALL_NO_WG, f32, env, f32, f32)

diff --git a/target/sparc/fop_helper.c b/target/sparc/fop_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/fop_helper.c
+++ b/target/sparc/fop_helper.c
@@ -XXX,XX +XXX,XX @@ Int128 helper_fsqrtq(CPUSPARCState *env, Int128 src)
 }

 float32 helper_fmadds(CPUSPARCState *env, float32 s1,
-                      float32 s2, float32 s3, uint32_t op)
+                      float32 s2, float32 s3, int32_t sc, uint32_t op)
 {
-    float32 ret = float32_muladd(s1, s2, s3, op, &env->fp_status);
+    float32 ret = float32_muladd_scalbn(s1, s2, s3, sc, op, &env->fp_status);
     check_ieee_exceptions(env, GETPC());
     return ret;
 }

 float64 helper_fmaddd(CPUSPARCState *env, float64 s1,
-                      float64 s2, float64 s3, uint32_t op)
+                      float64 s2, float64 s3, int32_t sc, uint32_t op)
 {
-    float64 ret = float64_muladd(s1, s2, s3, op, &env->fp_status);
+    float64 ret = float64_muladd_scalbn(s1, s2, s3, sc, op, &env->fp_status);
     check_ieee_exceptions(env, GETPC());
     return ret;
 }
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_op_fabsq(TCGv_i128 dst, TCGv_i128 src)

 static void gen_op_fmadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
 {
-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(0));
+    TCGv_i32 z = tcg_constant_i32(0);
+    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, z);
 }

 static void gen_op_fmaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
 {
-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(0));
+    TCGv_i32 z = tcg_constant_i32(0);
+    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, z);
 }

 static void gen_op_fmsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
 {
-    int op = float_muladd_negate_c;
-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
+    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
 }

 static void gen_op_fmsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
 {
-    int op = float_muladd_negate_c;
-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
+    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
 }

 static void gen_op_fnmsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
 {
-    int op = float_muladd_negate_c | float_muladd_negate_result;
-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c |
+                                   float_muladd_negate_result);
+    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
 }

 static void gen_op_fnmsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
 {
-    int op = float_muladd_negate_c | float_muladd_negate_result;
-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c |
+                                   float_muladd_negate_result);
+    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
 }

 static void gen_op_fnmadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2, TCGv_i32 s3)
 {
-    int op = float_muladd_negate_result;
-    gen_helper_fmadds(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
+    gen_helper_fmadds(d, tcg_env, s1, s2, s3, z, op);
 }

 static void gen_op_fnmaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2, TCGv_i64 s3)
 {
-    int op = float_muladd_negate_result;
-    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, tcg_constant_i32(op));
+    TCGv_i32 z = tcg_constant_i32(0);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
+    gen_helper_fmaddd(d, tcg_env, s1, s2, s3, z, op);
 }

 /* Use muladd to compute (1 * src1) + src2 / 2 with one rounding. */
 static void gen_op_fhadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
 {
-    TCGv_i32 one = tcg_constant_i32(float32_one);
-    int op = float_muladd_halve_result;
-    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i32 fone = tcg_constant_i32(float32_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(0);
+    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
 }

 static void gen_op_fhaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
 {
-    TCGv_i64 one = tcg_constant_i64(float64_one);
-    int op = float_muladd_halve_result;
-    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i64 fone = tcg_constant_i64(float64_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(0);
+    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
 }

 /* Use muladd to compute (1 * src1) - src2 / 2 with one rounding. */
 static void gen_op_fhsubs(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
 {
-    TCGv_i32 one = tcg_constant_i32(float32_one);
-    int op = float_muladd_negate_c | float_muladd_halve_result;
-    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i32 fone = tcg_constant_i32(float32_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
+    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
 }

 static void gen_op_fhsubd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
 {
-    TCGv_i64 one = tcg_constant_i64(float64_one);
-    int op = float_muladd_negate_c | float_muladd_halve_result;
-    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i64 fone = tcg_constant_i64(float64_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_c);
+    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
 }

 /* Use muladd to compute -((1 * src1) + src2 / 2) with one rounding. */
 static void gen_op_fnhadds(TCGv_i32 d, TCGv_i32 s1, TCGv_i32 s2)
 {
-    TCGv_i32 one = tcg_constant_i32(float32_one);
-    int op = float_muladd_negate_result | float_muladd_halve_result;
-    gen_helper_fmadds(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i32 fone = tcg_constant_i32(float32_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
+    gen_helper_fmadds(d, tcg_env, fone, s1, s2, mone, op);
 }

 static void gen_op_fnhaddd(TCGv_i64 d, TCGv_i64 s1, TCGv_i64 s2)
 {
-    TCGv_i64 one = tcg_constant_i64(float64_one);
-    int op = float_muladd_negate_result | float_muladd_halve_result;
-    gen_helper_fmaddd(d, tcg_env, one, s1, s2, tcg_constant_i32(op));
+    TCGv_i64 fone = tcg_constant_i64(float64_one);
+    TCGv_i32 mone = tcg_constant_i32(-1);
+    TCGv_i32 op = tcg_constant_i32(float_muladd_negate_result);
+    gen_helper_fmaddd(d, tcg_env, fone, s1, s2, mone, op);
 }

 static void gen_op_fpexception_im(DisasContext *dc, int ftt)
--
2.43.0
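
Note how the FHADD/FHSUB generators above encode the operation: the
first operand is the constant float32_one and the scale is -1, so the
helper evaluates (1 * src1 + src2) * 2**-1 with a single rounding.
The equivalent direct softfloat call, as a sketch outside TCG:

    /* fhadds: (s1 + s2) / 2, rounded once; assumes "fpu/softfloat.h". */
    float32 r = float32_muladd_scalbn(float32_one, s1, s2,
                                      -1, 0, &env->fp_status);
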
All uses have been converted to float*_muladd_scalbn.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat.h   | 3 ---
 fpu/softfloat.c           | 6 ------
 fpu/softfloat-parts.c.inc | 4 ----
 3 files changed, 13 deletions(-)

diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -XXX,XX +XXX,XX @@ bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status);
 | Using these differs from negating an input or output before calling
 | the muladd function in that this means that a NaN doesn't have its
 | sign bit inverted before it is propagated.
-| We also support halving the result before rounding, as a special
-| case to support the ARM fused-sqrt-step instruction FRSQRTS.
 *----------------------------------------------------------------------------*/
 enum {
     float_muladd_negate_c = 1,
     float_muladd_negate_product = 2,
     float_muladd_negate_result = 4,
-    float_muladd_halve_result = 8,
 };

 /*----------------------------------------------------------------------------
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     if (unlikely(!can_use_fpu(s))) {
         goto soft;
     }
-    if (unlikely(flags & float_muladd_halve_result)) {
-        goto soft;
-    }

     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
@@ -XXX,XX +XXX,XX @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
     if (unlikely(!can_use_fpu(s))) {
         goto soft;
     }
-    if (unlikely(flags & float_muladd_halve_result)) {
-        goto soft;
-    }

     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
     a->exp = p_widen.exp;

 return_normal:
-    /* TODO: Replace all use of float_muladd_halve_result with scale. */
-    if (flags & float_muladd_halve_result) {
-        a->exp -= 1;
-    }
     a->exp += scale;
 finish_sign:
     if (flags & float_muladd_negate_result) {
--
2.43.0

diff view generated by jsdifflib
1
Improve rotrv_vec to reduce "t1 = -v2, t2 = t1 + c" to
1
This rounding mode is used by Hexagon.
2
"t1 = -v2, t2 = c - v2". This avoids a serial dependency
3
between t1 and t2.
4
2
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
4
---
7
tcg/aarch64/tcg-target.c.inc | 10 +++++-----
5
include/fpu/softfloat-types.h | 2 ++
8
1 file changed, 5 insertions(+), 5 deletions(-)
6
fpu/softfloat-parts.c.inc | 3 +++
7
2 files changed, 5 insertions(+)
9
8
10
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
9
diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
11
index XXXXXXX..XXXXXXX 100644
10
index XXXXXXX..XXXXXXX 100644
12
--- a/tcg/aarch64/tcg-target.c.inc
11
--- a/include/fpu/softfloat-types.h
13
+++ b/tcg/aarch64/tcg-target.c.inc
12
+++ b/include/fpu/softfloat-types.h
14
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
13
@@ -XXX,XX +XXX,XX @@ typedef enum __attribute__((__packed__)) {
15
TCGArg a0, ...)
14
float_round_to_odd = 5,
16
{
15
/* Not an IEEE rounding mode: round to closest odd, overflow to inf */
17
va_list va;
16
float_round_to_odd_inf = 6,
18
- TCGv_vec v0, v1, v2, t1, t2;
17
+ /* Not an IEEE rounding mode: round to nearest even, overflow to max */
19
+ TCGv_vec v0, v1, v2, t1, t2, c1;
18
+ float_round_nearest_even_max = 7,
20
TCGArg a2;
19
} FloatRoundMode;
21
20
22
va_start(va, a0);
21
/*
23
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
22
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
24
23
index XXXXXXX..XXXXXXX 100644
25
case INDEX_op_rotlv_vec:
24
--- a/fpu/softfloat-parts.c.inc
26
t1 = tcg_temp_new_vec(type);
25
+++ b/fpu/softfloat-parts.c.inc
27
- tcg_gen_dupi_vec(vece, t1, 8 << vece);
26
@@ -XXX,XX +XXX,XX @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
28
- tcg_gen_sub_vec(vece, t1, v2, t1);
27
int exp, flags = 0;
29
+ c1 = tcg_constant_vec(type, vece, 8 << vece);
28
30
+ tcg_gen_sub_vec(vece, t1, v2, c1);
29
switch (s->float_rounding_mode) {
31
/* Right shifts are negative left shifts for AArch64. */
30
+ case float_round_nearest_even_max:
32
vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
31
+ overflow_norm = true;
33
tcgv_vec_arg(v1), tcgv_vec_arg(t1));
32
+ /* fall through */
34
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
33
case float_round_nearest_even:
35
case INDEX_op_rotrv_vec:
34
if (N > 64 && frac_lsb == 0) {
36
t1 = tcg_temp_new_vec(type);
35
inc = ((p->frac_hi & 1) || (p->frac_lo & round_mask) != frac_lsbm1
37
t2 = tcg_temp_new_vec(type);
38
+ c1 = tcg_constant_vec(type, vece, 8 << vece);
39
tcg_gen_neg_vec(vece, t1, v2);
40
- tcg_gen_dupi_vec(vece, t2, 8 << vece);
41
- tcg_gen_add_vec(vece, t2, t1, t2);
42
+ tcg_gen_sub_vec(vece, t2, c1, v2);
43
/* Right shifts are negative left shifts for AArch64. */
44
vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
45
tcgv_vec_arg(v1), tcgv_vec_arg(t1));
46
--
36
--
47
2.25.1
37
2.43.0
48
49
diff view generated by jsdifflib
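
A target opts in per operation by switching the rounding mode in its
float_status; only the overflow path differs from plain nearest-even.
A sketch, with st an initialised float_status and "fpu/softfloat.h"
in scope:

    set_float_rounding_mode(float_round_nearest_even_max, &st);
    float32 r = float32_mul(make_float32(0x7f000000),    /* 2**127 */
                            make_float32(0x7f000000), &st);
    /* r is 0x7f7fffff (largest normal), not 0x7f800000 (+Inf). */
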
Certain Hexagon instructions suppress changes to the result
when the product of fma() is a true zero.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat.h   | 5 +++++
 fpu/softfloat.c           | 3 +++
 fpu/softfloat-parts.c.inc | 4 +++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -XXX,XX +XXX,XX @@ bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status);
 | Using these differs from negating an input or output before calling
 | the muladd function in that this means that a NaN doesn't have its
 | sign bit inverted before it is propagated.
+|
+| With float_muladd_suppress_add_product_zero, if A or B is zero
+| such that the product is a true zero, then return C without addition.
+| This preserves the sign of C when C is +/- 0. Used for Hexagon.
 *----------------------------------------------------------------------------*/
 enum {
     float_muladd_negate_c = 1,
     float_muladd_negate_product = 2,
     float_muladd_negate_result = 4,
+    float_muladd_suppress_add_product_zero = 8,
 };

 /*----------------------------------------------------------------------------
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -XXX,XX +XXX,XX @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     if (unlikely(!can_use_fpu(s))) {
         goto soft;
     }
+    if (unlikely(flags & float_muladd_suppress_add_product_zero)) {
+        goto soft;
+    }

     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -XXX,XX +XXX,XX @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
         goto return_normal;
     }
     if (c->cls == float_class_zero) {
-        if (a->sign != c->sign) {
+        if (flags & float_muladd_suppress_add_product_zero) {
+            a->sign = c->sign;
+        } else if (a->sign != c->sign) {
             goto return_sub_zero;
         }
         goto return_zero;
--
2.43.0
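
The observable difference is confined to signed zeros.  A sketch of
the case the flag exists for, with st an initialised float_status in
nearest-even rounding:

    float32 x = make_float32(0x40000000);               /* 2.0 */
    float32 c = make_float32(0x80000000);               /* -0.0 */
    float32 r = float32_muladd(float32_zero, x, c,
                               float_muladd_suppress_add_product_zero,
                               &st);
    /* r is -0.0 (C returned unchanged); without the flag,
     * (+0.0 * 2.0) + -0.0 rounds to +0.0 under nearest-even. */
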
There are no special cases for this instruction.
Remove internal_mpyf as unused.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/fma_emu.h   | 1 -
 target/hexagon/fma_emu.c   | 8 --------
 target/hexagon/op_helper.c | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.h
+++ b/target/hexagon/fma_emu.h
@@ -XXX,XX +XXX,XX @@ int32_t float32_getexp(float32 f32);
 float32 infinite_float32(uint8_t sign);
 float32 internal_fmafx(float32 a, float32 b, float32 c,
                        int scale, float_status *fp_status);
-float32 internal_mpyf(float32 a, float32 b, float_status *fp_status);
 float64 internal_mpyhh(float64 a, float64 b,
                        unsigned long long int accumulated,
                        float_status *fp_status);
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.c
+++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ float32 internal_fmafx(float32 a, float32 b, float32 c, int scale,
     return accum_round_float32(result, fp_status);
 }

-float32 internal_mpyf(float32 a, float32 b, float_status *fp_status)
-{
-    if (float32_is_zero(a) || float32_is_zero(b)) {
-        return float32_mul(a, b, fp_status);
-    }
-    return internal_fmafx(a, b, float32_zero, 0, fp_status);
-}
-
 float64 internal_mpyhh(float64 a, float64 b,
                        unsigned long long int accumulated,
                        float_status *fp_status)
diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -XXX,XX +XXX,XX @@ float32 HELPER(sfmpy)(CPUHexagonState *env, float32 RsV, float32 RtV)
 {
     float32 RdV;
     arch_fpop_start(env);
-    RdV = internal_mpyf(RsV, RtV, &env->fp_status);
+    RdV = float32_mul(RsV, RtV, &env->fp_status);
     arch_fpop_end(env);
     return RdV;
 }
--
2.43.0

There are no special cases for this instruction.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffma)(CPUHexagonState *env, float32 RxV,
                       float32 RsV, float32 RtV)
 {
     arch_fpop_start(env);
-    RxV = internal_fmafx(RsV, RtV, RxV, 0, &env->fp_status);
+    RxV = float32_muladd(RsV, RtV, RxV, 0, &env->fp_status);
     arch_fpop_end(env);
     return RxV;
 }
--
2.43.0

There are no special cases for this instruction.  Since hexagon
always uses default-nan mode, explicitly negating the first
input is unnecessary.  Use float_muladd_negate_product instead.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV,
 float32 HELPER(sffms)(CPUHexagonState *env, float32 RxV,
                       float32 RsV, float32 RtV)
 {
-    float32 neg_RsV;
     arch_fpop_start(env);
-    neg_RsV = float32_set_sign(RsV, float32_is_neg(RsV) ? 0 : 1);
-    RxV = internal_fmafx(neg_RsV, RtV, RxV, 0, &env->fp_status);
+    RxV = float32_muladd(RsV, RtV, RxV, float_muladd_negate_product,
+                         &env->fp_status);
     arch_fpop_end(env);
     return RxV;
 }
--
2.43.0
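
The distinction the softfloat documentation draws is NaN sign
handling: flipping the sign bit of RsV first also flips the sign of a
NaN passed in RsV, while float_muladd_negate_product negates only the
product.  Because Hexagon runs softfloat in default-NaN mode, both
forms produce the same result, which is what justifies the change.
A sketch:

    /* Equal under default-NaN; may differ in the sign bit of a
     * propagated NaN on other configurations. */
    float32 r1 = float32_muladd(float32_chs(a), b, c, 0, &st);
    float32 r2 = float32_muladd(a, b, c,
                                float_muladd_negate_product, &st);
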
This instruction has a special case that 0 * x + c returns c
without the normal sign folding that comes with 0 + -0.
Use the new float_muladd_suppress_add_product_zero to
describe this.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -XXX,XX +XXX,XX @@ static float32 check_nan(float32 dst, float32 x, float_status *fp_status)
 float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV,
                          float32 RsV, float32 RtV, float32 PuV)
 {
-    size4s_t tmp;
     arch_fpop_start(env);
-    RxV = check_nan(RxV, RxV, &env->fp_status);
-    RxV = check_nan(RxV, RsV, &env->fp_status);
-    RxV = check_nan(RxV, RtV, &env->fp_status);
-    tmp = internal_fmafx(RsV, RtV, RxV, fSXTN(8, 64, PuV), &env->fp_status);
-    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
-        RxV = tmp;
-    }
+    RxV = float32_muladd_scalbn(RsV, RtV, RxV, fSXTN(8, 64, PuV),
+                                float_muladd_suppress_add_product_zero,
+                                &env->fp_status);
     arch_fpop_end(env);
     return RxV;
 }
--
2.43.0
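
The scale here is not a constant: fSXTN(8, 64, PuV) feeds the low
byte of the predicate register in as a signed exponent adjustment.
As a sketch of what that expression reduces to (an assumption about
the fSXTN macro, which sign-extends an N-bit field):

    int scale = (int8_t)PuV;    /* sign-extend bits [7:0] of PuV */
    RxV = float32_muladd_scalbn(RsV, RtV, RxV, scale,
                                float_muladd_suppress_add_product_zero,
                                &env->fp_status);
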
There are multiple special cases for this instruction.
(1) The saturate to normal maximum instead of overflow to infinity is
    handled by the new float_round_nearest_even_max rounding mode.
(2) The 0 * n + c special case is handled by the new
    float_muladd_suppress_add_product_zero flag.
(3) The Inf - Inf -> 0 special case can be detected after the fact
    by examining float_flag_invalid_isi.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/op_helper.c | 105 +++++++++----------------------------
 1 file changed, 26 insertions(+), 79 deletions(-)

diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffma)(CPUHexagonState *env, float32 RxV,
     return RxV;
 }

-static bool is_zero_prod(float32 a, float32 b)
-{
-    return ((float32_is_zero(a) && is_finite(b)) ||
-            (float32_is_zero(b) && is_finite(a)));
-}
-
-static float32 check_nan(float32 dst, float32 x, float_status *fp_status)
-{
-    float32 ret = dst;
-    if (float32_is_any_nan(x)) {
-        if (extract32(x, 22, 1) == 0) {
-            float_raise(float_flag_invalid, fp_status);
-        }
-        ret = make_float32(0xffffffff);    /* nan */
-    }
-    return ret;
-}
-
 float32 HELPER(sffma_sc)(CPUHexagonState *env, float32 RxV,
                          float32 RsV, float32 RtV, float32 PuV)
 {
@@ -XXX,XX +XXX,XX @@ float32 HELPER(sffms)(CPUHexagonState *env, float32 RxV,
     return RxV;
 }

-static bool is_inf_prod(int32_t a, int32_t b)
+static float32 do_sffma_lib(CPUHexagonState *env, float32 RxV,
+                            float32 RsV, float32 RtV, int negate)
 {
-    return (float32_is_infinity(a) && float32_is_infinity(b)) ||
-           (float32_is_infinity(a) && is_finite(b) && !float32_is_zero(b)) ||
-           (float32_is_infinity(b) && is_finite(a) && !float32_is_zero(a));
+    int flags;
+
+    arch_fpop_start(env);
+
+    set_float_rounding_mode(float_round_nearest_even_max, &env->fp_status);
+    RxV = float32_muladd(RsV, RtV, RxV,
+                         negate | float_muladd_suppress_add_product_zero,
+                         &env->fp_status);
+
+    flags = get_float_exception_flags(&env->fp_status);
+    if (flags) {
+        /* Flags are suppressed by this instruction. */
+        set_float_exception_flags(0, &env->fp_status);
+
+        /* Return 0 for Inf - Inf. */
+        if (flags & float_flag_invalid_isi) {
+            RxV = 0;
+        }
+    }
+
+    arch_fpop_end(env);
+    return RxV;
 }

 float32 HELPER(sffma_lib)(CPUHexagonState *env, float32 RxV,
                           float32 RsV, float32 RtV)
 {
-    bool infinp;
-    bool infminusinf;
-    float32 tmp;
-
-    arch_fpop_start(env);
-    set_float_rounding_mode(float_round_nearest_even, &env->fp_status);
-    infminusinf = float32_is_infinity(RxV) &&
-                  is_inf_prod(RsV, RtV) &&
-                  (fGETBIT(31, RsV ^ RxV ^ RtV) != 0);
-    infinp = float32_is_infinity(RxV) ||
-             float32_is_infinity(RtV) ||
-             float32_is_infinity(RsV);
-    RxV = check_nan(RxV, RxV, &env->fp_status);
-    RxV = check_nan(RxV, RsV, &env->fp_status);
-    RxV = check_nan(RxV, RtV, &env->fp_status);
-    tmp = internal_fmafx(RsV, RtV, RxV, 0, &env->fp_status);
-    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
-        RxV = tmp;
-    }
-    set_float_exception_flags(0, &env->fp_status);
-    if (float32_is_infinity(RxV) && !infinp) {
-        RxV = RxV - 1;
-    }
-    if (infminusinf) {
-        RxV = 0;
-    }
-    arch_fpop_end(env);
-    return RxV;
+    return do_sffma_lib(env, RxV, RsV, RtV, 0);
 }

 float32 HELPER(sffms_lib)(CPUHexagonState *env, float32 RxV,
                           float32 RsV, float32 RtV)
 {
-    bool infinp;
-    bool infminusinf;
-    float32 tmp;
-
-    arch_fpop_start(env);
-    set_float_rounding_mode(float_round_nearest_even, &env->fp_status);
-    infminusinf = float32_is_infinity(RxV) &&
-                  is_inf_prod(RsV, RtV) &&
-                  (fGETBIT(31, RsV ^ RxV ^ RtV) == 0);
-    infinp = float32_is_infinity(RxV) ||
-             float32_is_infinity(RtV) ||
-             float32_is_infinity(RsV);
-    RxV = check_nan(RxV, RxV, &env->fp_status);
-    RxV = check_nan(RxV, RsV, &env->fp_status);
-    RxV = check_nan(RxV, RtV, &env->fp_status);
-    float32 minus_RsV = float32_sub(float32_zero, RsV, &env->fp_status);
-    tmp = internal_fmafx(minus_RsV, RtV, RxV, 0, &env->fp_status);
-    if (!(float32_is_zero(RxV) && is_zero_prod(RsV, RtV))) {
-        RxV = tmp;
-    }
-    set_float_exception_flags(0, &env->fp_status);
-    if (float32_is_infinity(RxV) && !infinp) {
-        RxV = RxV - 1;
-    }
-    if (infminusinf) {
-        RxV = 0;
-    }
-    arch_fpop_end(env);
-    return RxV;
+    return do_sffma_lib(env, RxV, RsV, RtV, float_muladd_negate_product);
 }

 float64 HELPER(dfmpyfix)(CPUHexagonState *env, float64 RssV, float64 RttV)
--
2.43.0
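
Of the three special cases, only Inf - Inf is handled after the fused
operation: float_flag_invalid_isi is the status bit softfloat raises
specifically for an infinity-minus-infinity invalid operation, so the
helper can tell it apart from other invalid causes.  A standalone
sketch of the flag, assuming "fpu/softfloat.h":

    float_status st = { 0 };
    set_float_exception_flags(0, &st);
    (void)float32_add(float32_infinity,
                      float32_chs(float32_infinity), &st);
    /* get_float_exception_flags(&st) now has float_flag_invalid_isi
     * (as well as float_flag_invalid) set. */
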
The function is now unused.

Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/fma_emu.h |   2 -
 target/hexagon/fma_emu.c | 171 ---------------------------------------
 2 files changed, 173 deletions(-)

diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.h
+++ b/target/hexagon/fma_emu.h
@@ -XXX,XX +XXX,XX @@ static inline uint32_t float32_getexp_raw(float32 f32)
 }
 int32_t float32_getexp(float32 f32);
 float32 infinite_float32(uint8_t sign);
-float32 internal_fmafx(float32 a, float32 b, float32 c,
-                       int scale, float_status *fp_status);
 float64 internal_mpyhh(float64 a, float64 b,
                        unsigned long long int accumulated,
                        float_status *fp_status);
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/fma_emu.c
+++ b/target/hexagon/fma_emu.c
@@ -XXX,XX +XXX,XX @@ int32_t float64_getexp(float64 f64)
     return -1;
 }

-static uint64_t float32_getmant(float32 f32)
-{
-    Float a = { .i = f32 };
-    if (float32_is_normal(f32)) {
-        return a.mant | 1ULL << 23;
-    }
-    if (float32_is_zero(f32)) {
-        return 0;
-    }
-    if (float32_is_denormal(f32)) {
-        return a.mant;
-    }
-    return ~0ULL;
-}
-
 int32_t float32_getexp(float32 f32)
 {
     Float a = { .i = f32 };
@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
 }

 /* Return a maximum finite value with the requested sign */
-static float32 maxfinite_float32(uint8_t sign)
-{
-    if (sign) {
-        return make_float32(SF_MINUS_MAXF);
-    } else {
-        return make_float32(SF_MAXF);
-    }
-}
-
-/* Return a zero value with requested sign */
-static float32 zero_float32(uint8_t sign)
-{
-    if (sign) {
-        return make_float32(0x80000000);
-    } else {
-        return float32_zero;
-    }
-}
-
 #define GEN_XF_ROUND(SUFFIX, MANTBITS, INF_EXP, INTERNAL_TYPE) \
 static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
 { \
@@ -XXX,XX +XXX,XX @@ static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
 }

 GEN_XF_ROUND(float64, DF_MANTBITS, DF_INF_EXP, Double)
-GEN_XF_ROUND(float32, SF_MANTBITS, SF_INF_EXP, Float)
-
-static bool is_inf_prod(float64 a, float64 b)
-{
-    return ((float64_is_infinity(a) && float64_is_infinity(b)) ||
-            (float64_is_infinity(a) && is_finite(b) && (!float64_is_zero(b))) ||
-            (float64_is_infinity(b) && is_finite(a) && (!float64_is_zero(a))));
-}
-
-static float64 special_fma(float64 a, float64 b, float64 c,
-                           float_status *fp_status)
-{
-    float64 ret = make_float64(0);
-
-    /*
-     * If A multiplied by B is an exact infinity and C is also an infinity
-     * but with the opposite sign, FMA returns NaN and raises invalid.
-     */
-    uint8_t a_sign = float64_is_neg(a);
-    uint8_t b_sign = float64_is_neg(b);
-    uint8_t c_sign = float64_is_neg(c);
-    if (is_inf_prod(a, b) && float64_is_infinity(c)) {
-        if ((a_sign ^ b_sign) != c_sign) {
-            ret = make_float64(DF_NAN);
-            float_raise(float_flag_invalid, fp_status);
-            return ret;
-        }
-    }
-    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
-        (float64_is_zero(a) && float64_is_infinity(b))) {
-        ret = make_float64(DF_NAN);
-        float_raise(float_flag_invalid, fp_status);
-        return ret;
-    }
-    /*
-     * If none of the above checks are true and C is a NaN,
115
- * a NaN shall be returned
116
- * If A or B are NaN, a NAN shall be returned.
117
- */
118
- if (float64_is_any_nan(a) ||
119
- float64_is_any_nan(b) ||
120
- float64_is_any_nan(c)) {
121
- if (float64_is_any_nan(a) && (fGETBIT(51, a) == 0)) {
122
- float_raise(float_flag_invalid, fp_status);
123
- }
124
- if (float64_is_any_nan(b) && (fGETBIT(51, b) == 0)) {
125
- float_raise(float_flag_invalid, fp_status);
126
- }
127
- if (float64_is_any_nan(c) && (fGETBIT(51, c) == 0)) {
128
- float_raise(float_flag_invalid, fp_status);
129
- }
130
- ret = make_float64(DF_NAN);
131
- return ret;
132
- }
133
- /*
134
- * We have checked for adding opposite-signed infinities.
135
- * Other infinities return infinity with the correct sign
136
- */
137
- if (float64_is_infinity(c)) {
138
- ret = infinite_float64(c_sign);
139
- return ret;
140
- }
141
- if (float64_is_infinity(a) || float64_is_infinity(b)) {
142
- ret = infinite_float64(a_sign ^ b_sign);
143
- return ret;
144
- }
145
- g_assert_not_reached();
146
-}
147
-
148
-static float32 special_fmaf(float32 a, float32 b, float32 c,
149
- float_status *fp_status)
150
-{
151
- float64 aa, bb, cc;
152
- aa = float32_to_float64(a, fp_status);
153
- bb = float32_to_float64(b, fp_status);
154
- cc = float32_to_float64(c, fp_status);
155
- return float64_to_float32(special_fma(aa, bb, cc, fp_status), fp_status);
156
-}
157
-
158
-float32 internal_fmafx(float32 a, float32 b, float32 c, int scale,
159
- float_status *fp_status)
160
-{
161
- Accum prod;
162
- Accum acc;
163
- Accum result;
164
- accum_init(&prod);
165
- accum_init(&acc);
166
- accum_init(&result);
167
-
168
- uint8_t a_sign = float32_is_neg(a);
169
- uint8_t b_sign = float32_is_neg(b);
170
- uint8_t c_sign = float32_is_neg(c);
171
- if (float32_is_infinity(a) ||
172
- float32_is_infinity(b) ||
173
- float32_is_infinity(c)) {
174
- return special_fmaf(a, b, c, fp_status);
175
- }
176
- if (float32_is_any_nan(a) ||
177
- float32_is_any_nan(b) ||
178
- float32_is_any_nan(c)) {
179
- return special_fmaf(a, b, c, fp_status);
180
- }
181
- if ((scale == 0) && (float32_is_zero(a) || float32_is_zero(b))) {
182
- float32 tmp = float32_mul(a, b, fp_status);
183
- tmp = float32_add(tmp, c, fp_status);
184
- return tmp;
185
- }
186
-
187
- /* (a * 2**b) * (c * 2**d) == a*c * 2**(b+d) */
188
- prod.mant = int128_mul_6464(float32_getmant(a), float32_getmant(b));
189
-
190
- /*
191
- * Note: extracting the mantissa into an int is multiplying by
192
- * 2**23, so adjust here
193
- */
194
- prod.exp = float32_getexp(a) + float32_getexp(b) - SF_BIAS - 23;
195
- prod.sign = a_sign ^ b_sign;
196
- if (float32_is_zero(a) || float32_is_zero(b)) {
197
- prod.exp = -2 * WAY_BIG_EXP;
198
- }
199
- if ((scale > 0) && float32_is_denormal(c)) {
200
- acc.mant = int128_mul_6464(0, 0);
201
- acc.exp = -WAY_BIG_EXP;
202
- acc.sign = c_sign;
203
- acc.sticky = 1;
204
- result = accum_add(prod, acc);
205
- } else if (!float32_is_zero(c)) {
206
- acc.mant = int128_mul_6464(float32_getmant(c), 1);
207
- acc.exp = float32_getexp(c);
208
- acc.sign = c_sign;
209
- result = accum_add(prod, acc);
210
- } else {
211
- result = prod;
212
- }
213
- result.exp += scale;
214
- return accum_round_float32(result, fp_status);
215
-}
216
217
float64 internal_mpyhh(float64 a, float64 b,
218
unsigned long long int accumulated,
219
--
220
2.43.0
New patch
1
This massive macro is now used only once.
2
Expand it for use only by float64.
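
For readers unfamiliar with this idiom, here is a toy version of such a type-stamping generator macro and its single remaining instantiation (hypothetical names, far smaller than the real GEN_XF_ROUND). Once only one instantiation is left, an open-coded function is easier to read, step through, and annotate:

#include <stdio.h>

/* Toy generator macro: one getexp_<type>() function per instantiation. */
#define GEN_GETEXP(SUFFIX, EXPBITS, MANTBITS)                         \
    static unsigned getexp_##SUFFIX(unsigned long long bits)          \
    {                                                                 \
        return (unsigned)(bits >> MANTBITS) & ((1u << EXPBITS) - 1);  \
    }

GEN_GETEXP(float64, 11, 52)   /* the one remaining user */

int main(void)
{
    /* float64 1.0 is 0x3FF0000000000000: biased exponent 0x3ff. */
    printf("exp=0x%x\n", getexp_float64(0x3FF0000000000000ULL));
    return 0;
}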
1
3
4
Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/hexagon/fma_emu.c | 255 +++++++++++++++++++--------------------
8
1 file changed, 127 insertions(+), 128 deletions(-)
9
10
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/hexagon/fma_emu.c
13
+++ b/target/hexagon/fma_emu.c
14
@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
15
}
16
17
/* Return a maximum finite value with the requested sign */
18
-#define GEN_XF_ROUND(SUFFIX, MANTBITS, INF_EXP, INTERNAL_TYPE) \
19
-static SUFFIX accum_round_##SUFFIX(Accum a, float_status * fp_status) \
20
-{ \
21
- if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0) \
22
- && ((a.guard | a.round | a.sticky) == 0)) { \
23
- /* result zero */ \
24
- switch (fp_status->float_rounding_mode) { \
25
- case float_round_down: \
26
- return zero_##SUFFIX(1); \
27
- default: \
28
- return zero_##SUFFIX(0); \
29
- } \
30
- } \
31
- /* Normalize right */ \
32
- /* We want MANTBITS bits of mantissa plus the leading one. */ \
33
- /* That means that we want MANTBITS+1 bits, or 0x000000000000FF_FFFF */ \
34
- /* So we need to normalize right while the high word is non-zero and \
35
- * while the low word is nonzero when masked with 0xffe0_0000_0000_0000 */ \
36
- while ((int128_gethi(a.mant) != 0) || \
37
- ((int128_getlo(a.mant) >> (MANTBITS + 1)) != 0)) { \
38
- a = accum_norm_right(a, 1); \
39
- } \
40
- /* \
41
- * OK, now normalize left \
42
- * We want to normalize left until we have a leading one in bit 24 \
43
- * Theoretically, we only need to shift a maximum of one to the left if we \
44
- * shifted out lots of bits from B, or if we had no shift / 1 shift sticky \
45
- * should be 0 \
46
- */ \
47
- while ((int128_getlo(a.mant) & (1ULL << MANTBITS)) == 0) { \
48
- a = accum_norm_left(a); \
49
- } \
50
- /* \
51
- * OK, now we might need to denormalize because of potential underflow. \
52
- * We need to do this before rounding, and rounding might make us normal \
53
- * again \
54
- */ \
55
- while (a.exp <= 0) { \
56
- a = accum_norm_right(a, 1 - a.exp); \
57
- /* \
58
- * Do we have underflow? \
59
- * That's when we get an inexact answer because we ran out of bits \
60
- * in a denormal. \
61
- */ \
62
- if (a.guard || a.round || a.sticky) { \
63
- float_raise(float_flag_underflow, fp_status); \
64
- } \
65
- } \
66
- /* OK, we're relatively canonical... now we need to round */ \
67
- if (a.guard || a.round || a.sticky) { \
68
- float_raise(float_flag_inexact, fp_status); \
69
- switch (fp_status->float_rounding_mode) { \
70
- case float_round_to_zero: \
71
- /* Chop and we're done */ \
72
- break; \
73
- case float_round_up: \
74
- if (a.sign == 0) { \
75
- a.mant = int128_add(a.mant, int128_one()); \
76
- } \
77
- break; \
78
- case float_round_down: \
79
- if (a.sign != 0) { \
80
- a.mant = int128_add(a.mant, int128_one()); \
81
- } \
82
- break; \
83
- default: \
84
- if (a.round || a.sticky) { \
85
- /* round up if guard is 1, down if guard is zero */ \
86
- a.mant = int128_add(a.mant, int128_make64(a.guard)); \
87
- } else if (a.guard) { \
88
- /* exactly .5, round up if odd */ \
89
- a.mant = int128_add(a.mant, int128_and(a.mant, int128_one())); \
90
- } \
91
- break; \
92
- } \
93
- } \
94
- /* \
95
- * OK, now we might have carried all the way up. \
96
- * So we might need to shr once \
97
- * at least we know that the lsb should be zero if we rounded and \
98
- * got a carry out... \
99
- */ \
100
- if ((int128_getlo(a.mant) >> (MANTBITS + 1)) != 0) { \
101
- a = accum_norm_right(a, 1); \
102
- } \
103
- /* Overflow? */ \
104
- if (a.exp >= INF_EXP) { \
105
- /* Yep, inf result */ \
106
- float_raise(float_flag_overflow, fp_status); \
107
- float_raise(float_flag_inexact, fp_status); \
108
- switch (fp_status->float_rounding_mode) { \
109
- case float_round_to_zero: \
110
- return maxfinite_##SUFFIX(a.sign); \
111
- case float_round_up: \
112
- if (a.sign == 0) { \
113
- return infinite_##SUFFIX(a.sign); \
114
- } else { \
115
- return maxfinite_##SUFFIX(a.sign); \
116
- } \
117
- case float_round_down: \
118
- if (a.sign != 0) { \
119
- return infinite_##SUFFIX(a.sign); \
120
- } else { \
121
- return maxfinite_##SUFFIX(a.sign); \
122
- } \
123
- default: \
124
- return infinite_##SUFFIX(a.sign); \
125
- } \
126
- } \
127
- /* Underflow? */ \
128
- if (int128_getlo(a.mant) & (1ULL << MANTBITS)) { \
129
- /* Leading one means: No, we're normal. So, we should be done... */ \
130
- INTERNAL_TYPE ret; \
131
- ret.i = 0; \
132
- ret.sign = a.sign; \
133
- ret.exp = a.exp; \
134
- ret.mant = int128_getlo(a.mant); \
135
- return ret.i; \
136
- } \
137
- assert(a.exp == 1); \
138
- INTERNAL_TYPE ret; \
139
- ret.i = 0; \
140
- ret.sign = a.sign; \
141
- ret.exp = 0; \
142
- ret.mant = int128_getlo(a.mant); \
143
- return ret.i; \
144
+static float64 accum_round_float64(Accum a, float_status *fp_status)
145
+{
146
+ if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0)
147
+ && ((a.guard | a.round | a.sticky) == 0)) {
148
+ /* result zero */
149
+ switch (fp_status->float_rounding_mode) {
150
+ case float_round_down:
151
+ return zero_float64(1);
152
+ default:
153
+ return zero_float64(0);
154
+ }
155
+ }
156
+ /*
157
+ * Normalize right
158
+ * We want DF_MANTBITS bits of mantissa plus the leading one.
159
+ * That means that we want DF_MANTBITS+1 bits, or 0x000000000000FF_FFFF
160
+ * So we need to normalize right while the high word is non-zero and
161
+ * while the low word is nonzero when masked with 0xffe0_0000_0000_0000
162
+ */
163
+ while ((int128_gethi(a.mant) != 0) ||
164
+ ((int128_getlo(a.mant) >> (DF_MANTBITS + 1)) != 0)) {
165
+ a = accum_norm_right(a, 1);
166
+ }
167
+ /*
168
+ * OK, now normalize left
169
+ * We want to normalize left until we have a leading one in bit 24
170
+ * Theoretically, we only need to shift a maximum of one to the left if we
171
+ * shifted out lots of bits from B, or if we had no shift / 1 shift sticky
172
+ * should be 0
173
+ */
174
+ while ((int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) == 0) {
175
+ a = accum_norm_left(a);
176
+ }
177
+ /*
178
+ * OK, now we might need to denormalize because of potential underflow.
179
+ * We need to do this before rounding, and rounding might make us normal
180
+ * again
181
+ */
182
+ while (a.exp <= 0) {
183
+ a = accum_norm_right(a, 1 - a.exp);
184
+ /*
185
+ * Do we have underflow?
186
+ * That's when we get an inexact answer because we ran out of bits
187
+ * in a denormal.
188
+ */
189
+ if (a.guard || a.round || a.sticky) {
190
+ float_raise(float_flag_underflow, fp_status);
191
+ }
192
+ }
193
+ /* OK, we're relatively canonical... now we need to round */
194
+ if (a.guard || a.round || a.sticky) {
195
+ float_raise(float_flag_inexact, fp_status);
196
+ switch (fp_status->float_rounding_mode) {
197
+ case float_round_to_zero:
198
+ /* Chop and we're done */
199
+ break;
200
+ case float_round_up:
201
+ if (a.sign == 0) {
202
+ a.mant = int128_add(a.mant, int128_one());
203
+ }
204
+ break;
205
+ case float_round_down:
206
+ if (a.sign != 0) {
207
+ a.mant = int128_add(a.mant, int128_one());
208
+ }
209
+ break;
210
+ default:
211
+ if (a.round || a.sticky) {
212
+ /* round up if guard is 1, down if guard is zero */
213
+ a.mant = int128_add(a.mant, int128_make64(a.guard));
214
+ } else if (a.guard) {
215
+ /* exactly .5, round up if odd */
216
+ a.mant = int128_add(a.mant, int128_and(a.mant, int128_one()));
217
+ }
218
+ break;
219
+ }
220
+ }
221
+ /*
222
+ * OK, now we might have carried all the way up.
223
+ * So we might need to shr once
224
+ * at least we know that the lsb should be zero if we rounded and
225
+ * got a carry out...
226
+ */
227
+ if ((int128_getlo(a.mant) >> (DF_MANTBITS + 1)) != 0) {
228
+ a = accum_norm_right(a, 1);
229
+ }
230
+ /* Overflow? */
231
+ if (a.exp >= DF_INF_EXP) {
232
+ /* Yep, inf result */
233
+ float_raise(float_flag_overflow, fp_status);
234
+ float_raise(float_flag_inexact, fp_status);
235
+ switch (fp_status->float_rounding_mode) {
236
+ case float_round_to_zero:
237
+ return maxfinite_float64(a.sign);
238
+ case float_round_up:
239
+ if (a.sign == 0) {
240
+ return infinite_float64(a.sign);
241
+ } else {
242
+ return maxfinite_float64(a.sign);
243
+ }
244
+ case float_round_down:
245
+ if (a.sign != 0) {
246
+ return infinite_float64(a.sign);
247
+ } else {
248
+ return maxfinite_float64(a.sign);
249
+ }
250
+ default:
251
+ return infinite_float64(a.sign);
252
+ }
253
+ }
254
+ /* Underflow? */
255
+ if (int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) {
256
+ /* Leading one means: No, we're normal. So, we should be done... */
257
+ Double ret;
258
+ ret.i = 0;
259
+ ret.sign = a.sign;
260
+ ret.exp = a.exp;
261
+ ret.mant = int128_getlo(a.mant);
262
+ return ret.i;
263
+ }
264
+ assert(a.exp == 1);
265
+ Double ret;
266
+ ret.i = 0;
267
+ ret.sign = a.sign;
268
+ ret.exp = 0;
269
+ ret.mant = int128_getlo(a.mant);
270
+ return ret.i;
271
}
272
273
-GEN_XF_ROUND(float64, DF_MANTBITS, DF_INF_EXP, Double)
274
-
275
float64 internal_mpyhh(float64 a, float64 b,
276
unsigned long long int accumulated,
277
float_status *fp_status)
278
--
279
2.43.0
New patch
1
This structure, with bitfields, is incorrect for big-endian.
2
Use the existing float32_getexp_raw which uses extract32.
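
The underlying issue: the C standard leaves bit-field allocation order to the implementation, and in practice it differs between little- and big-endian ABIs, so a struct overlay of an IEEE float only works on some hosts. Extracting by bit position is host-independent. A standalone sketch, with the extract32() body written here for self-containment (it mirrors the spirit of QEMU's include/qemu/bitops.h helper):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the shape of QEMU's extract32() from include/qemu/bitops.h. */
static inline uint32_t extract32(uint32_t value, int start, int length)
{
    return (value >> start) & (~0U >> (32 - length));
}

int main(void)
{
    uint32_t f32 = 0xc2280000; /* bits of -42.0f: sign 1, exp 0x84, mant 0x280000 */

    /* IEEE 754 fixes these bit positions regardless of host endianness. */
    printf("mant=0x%06x exp=0x%02x sign=%u\n",
           extract32(f32, 0, 23),   /* mantissa: bits [22:0]  */
           extract32(f32, 23, 8),   /* exponent: bits [30:23] */
           extract32(f32, 31, 1));  /* sign:     bit  31      */
    return 0;
}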
1
3
4
Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/hexagon/fma_emu.c | 16 +++-------------
8
1 file changed, 3 insertions(+), 13 deletions(-)
9
10
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/hexagon/fma_emu.c
13
+++ b/target/hexagon/fma_emu.c
14
@@ -XXX,XX +XXX,XX @@ typedef union {
15
};
16
} Double;
17
18
-typedef union {
19
- float f;
20
- uint32_t i;
21
- struct {
22
- uint32_t mant:23;
23
- uint32_t exp:8;
24
- uint32_t sign:1;
25
- };
26
-} Float;
27
-
28
static uint64_t float64_getmant(float64 f64)
29
{
30
Double a = { .i = f64 };
31
@@ -XXX,XX +XXX,XX @@ int32_t float64_getexp(float64 f64)
32
33
int32_t float32_getexp(float32 f32)
34
{
35
- Float a = { .i = f32 };
36
+ int exp = float32_getexp_raw(f32);
37
if (float32_is_normal(f32)) {
38
- return a.exp;
39
+ return exp;
40
}
41
if (float32_is_denormal(f32)) {
42
- return a.exp + 1;
43
+ return exp + 1;
44
}
45
return -1;
46
}
47
--
48
2.43.0
New patch
1
This structure, with bitfields, is incorrect for big-endian.
2
Use extract64 and deposit64 instead.
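
The same fix as the float32 case, now for the 64-bit Double overlay: address the sign, exponent, and mantissa fields by bit position. A standalone sketch follows; the helper bodies are written out here for self-containment, while QEMU's own extract64/deposit64 live in include/qemu/bitops.h:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

static uint64_t extract64(uint64_t value, int start, int length)
{
    return (value >> start) & (~0ULL >> (64 - length));
}

static uint64_t deposit64(uint64_t value, int start, int length,
                          uint64_t fieldval)
{
    uint64_t mask = (~0ULL >> (64 - length)) << start;
    return (value & ~mask) | ((fieldval << start) & mask);
}

int main(void)
{
    /* Assemble float64 1.0: sign 0, biased exponent 0x3ff, mantissa 0. */
    uint64_t bits = 0;
    bits = deposit64(bits, 0, 52, 0);      /* mantissa */
    bits = deposit64(bits, 52, 11, 0x3ff); /* exponent */
    bits = deposit64(bits, 63, 1, 0);      /* sign */
    assert(bits == 0x3ff0000000000000ULL);
    assert(extract64(bits, 52, 11) == 0x3ff);
    printf("ok: 0x%016llx\n", (unsigned long long)bits);
    return 0;
}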
1
3
4
Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/hexagon/fma_emu.c | 46 ++++++++++++++--------------------------
8
1 file changed, 16 insertions(+), 30 deletions(-)
9
10
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/hexagon/fma_emu.c
13
+++ b/target/hexagon/fma_emu.c
14
@@ -XXX,XX +XXX,XX @@
15
16
#define WAY_BIG_EXP 4096
17
18
-typedef union {
19
- double f;
20
- uint64_t i;
21
- struct {
22
- uint64_t mant:52;
23
- uint64_t exp:11;
24
- uint64_t sign:1;
25
- };
26
-} Double;
27
-
28
static uint64_t float64_getmant(float64 f64)
29
{
30
- Double a = { .i = f64 };
31
+ uint64_t mant = extract64(f64, 0, 52);
32
if (float64_is_normal(f64)) {
33
- return a.mant | 1ULL << 52;
34
+ return mant | 1ULL << 52;
35
}
36
if (float64_is_zero(f64)) {
37
return 0;
38
}
39
if (float64_is_denormal(f64)) {
40
- return a.mant;
41
+ return mant;
42
}
43
return ~0ULL;
44
}
45
46
int32_t float64_getexp(float64 f64)
47
{
48
- Double a = { .i = f64 };
49
+ int exp = extract64(f64, 52, 11);
50
if (float64_is_normal(f64)) {
51
- return a.exp;
52
+ return exp;
53
}
54
if (float64_is_denormal(f64)) {
55
- return a.exp + 1;
56
+ return exp + 1;
57
}
58
return -1;
59
}
60
@@ -XXX,XX +XXX,XX @@ float32 infinite_float32(uint8_t sign)
61
/* Return a maximum finite value with the requested sign */
62
static float64 accum_round_float64(Accum a, float_status *fp_status)
63
{
64
+ uint64_t ret;
65
+
66
if ((int128_gethi(a.mant) == 0) && (int128_getlo(a.mant) == 0)
67
&& ((a.guard | a.round | a.sticky) == 0)) {
68
/* result zero */
69
@@ -XXX,XX +XXX,XX @@ static float64 accum_round_float64(Accum a, float_status *fp_status)
70
}
71
}
72
/* Underflow? */
73
- if (int128_getlo(a.mant) & (1ULL << DF_MANTBITS)) {
74
+ ret = int128_getlo(a.mant);
75
+ if (ret & (1ULL << DF_MANTBITS)) {
76
/* Leading one means: No, we're normal. So, we should be done... */
77
- Double ret;
78
- ret.i = 0;
79
- ret.sign = a.sign;
80
- ret.exp = a.exp;
81
- ret.mant = int128_getlo(a.mant);
82
- return ret.i;
83
+ ret = deposit64(ret, 52, 11, a.exp);
84
+ } else {
85
+ assert(a.exp == 1);
86
+ ret = deposit64(ret, 52, 11, 0);
87
}
88
- assert(a.exp == 1);
89
- Double ret;
90
- ret.i = 0;
91
- ret.sign = a.sign;
92
- ret.exp = 0;
93
- ret.mant = int128_getlo(a.mant);
94
- return ret.i;
95
+ ret = deposit64(ret, 63, 1, a.sign);
96
+ return ret;
97
}
98
99
float64 internal_mpyhh(float64 a, float64 b,
100
--
101
2.43.0
1
These interfaces have been replaced by tcg_gen_dupi_vec
1
No need to open-code 64x64->128-bit multiplication.
2
and tcg_constant_vec.
3
2
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
3
Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
5
---
7
include/tcg/tcg-op.h | 4 ----
6
target/hexagon/fma_emu.c | 32 +++-----------------------------
8
tcg/tcg-op-vec.c | 20 --------------------
7
1 file changed, 3 insertions(+), 29 deletions(-)
9
2 files changed, 24 deletions(-)
10
8
11
diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
9
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
12
index XXXXXXX..XXXXXXX 100644
10
index XXXXXXX..XXXXXXX 100644
13
--- a/include/tcg/tcg-op.h
11
--- a/target/hexagon/fma_emu.c
14
+++ b/include/tcg/tcg-op.h
12
+++ b/target/hexagon/fma_emu.c
15
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
13
@@ -XXX,XX +XXX,XX @@ int32_t float32_getexp(float32 f32)
16
void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
14
return -1;
17
void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
18
void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
19
-void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
20
-void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
21
-void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
22
-void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
23
void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
24
void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
25
void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
26
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
27
index XXXXXXX..XXXXXXX 100644
28
--- a/tcg/tcg-op-vec.c
29
+++ b/tcg/tcg-op-vec.c
30
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
31
return tcg_const_ones_vec(t->base_type);
32
}
15
}
33
16
34
-void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
17
-static uint32_t int128_getw0(Int128 x)
35
-{
18
-{
36
- tcg_gen_dupi_vec(MO_64, r, a);
19
- return int128_getlo(x);
37
-}
20
-}
38
-
21
-
39
-void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
22
-static uint32_t int128_getw1(Int128 x)
40
-{
23
-{
41
- tcg_gen_dupi_vec(MO_32, r, a);
24
- return int128_getlo(x) >> 32;
42
-}
25
-}
43
-
26
-
44
-void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
27
static Int128 int128_mul_6464(uint64_t ai, uint64_t bi)
45
-{
28
{
46
- tcg_gen_dupi_vec(MO_16, r, a);
29
- Int128 a, b;
47
-}
30
- uint64_t pp0, pp1a, pp1b, pp1s, pp2;
31
+ uint64_t l, h;
32
33
- a = int128_make64(ai);
34
- b = int128_make64(bi);
35
- pp0 = (uint64_t)int128_getw0(a) * (uint64_t)int128_getw0(b);
36
- pp1a = (uint64_t)int128_getw1(a) * (uint64_t)int128_getw0(b);
37
- pp1b = (uint64_t)int128_getw1(b) * (uint64_t)int128_getw0(a);
38
- pp2 = (uint64_t)int128_getw1(a) * (uint64_t)int128_getw1(b);
48
-
39
-
49
-void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
40
- pp1s = pp1a + pp1b;
50
-{
41
- if ((pp1s < pp1a) || (pp1s < pp1b)) {
51
- tcg_gen_dupi_vec(MO_8, r, a);
42
- pp2 += (1ULL << 32);
52
-}
43
- }
44
- uint64_t ret_low = pp0 + (pp1s << 32);
45
- if ((ret_low < pp0) || (ret_low < (pp1s << 32))) {
46
- pp2 += 1;
47
- }
53
-
48
-
54
void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
49
- return int128_make128(ret_low, pp2 + (pp1s >> 32));
55
{
50
+ mulu64(&l, &h, ai, bi);
56
TCGTemp *rt = tcgv_vec_temp(r);
51
+ return int128_make128(l, h);
52
}
53
54
static Int128 int128_sub_borrow(Int128 a, Int128 b, int borrow)
57
--
55
--
58
2.25.1
56
2.43.0
59
60
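
For reference, the int128_mul_6464() rewrite above leans on a host full multiply. Here is a standalone stand-in for mulu64() — QEMU's real helper lives in the host-utils code; this sketch assumes a compiler providing unsigned __int128, as GCC and Clang do on 64-bit hosts:

#include <stdint.h>
#include <stdio.h>

static void mulu64_sketch(uint64_t *plow, uint64_t *phigh,
                          uint64_t a, uint64_t b)
{
    unsigned __int128 r = (unsigned __int128)a * b;
    *plow = (uint64_t)r;
    *phigh = (uint64_t)(r >> 64);
}

int main(void)
{
    uint64_t lo, hi;
    mulu64_sketch(&lo, &hi, UINT64_MAX, UINT64_MAX);
    /* (2^64-1)^2 = 2^128 - 2^65 + 1: hi = 0xff..fe, lo = 1. */
    printf("hi=0x%016llx lo=0x%016llx\n",
           (unsigned long long)hi, (unsigned long long)lo);
    return 0;
}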
New patch
1
Initialize x.mant from accumulated by direct assignment,
2
rather than by multiplying by 1.
1
3
4
Reviewed-by: Brian Cain <brian.cain@oss.qualcomm.com>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
target/hexagon/fma_emu.c | 2 +-
8
1 file changed, 1 insertion(+), 1 deletion(-)
9
10
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
11
index XXXXXXX..XXXXXXX 100644
12
--- a/target/hexagon/fma_emu.c
13
+++ b/target/hexagon/fma_emu.c
14
@@ -XXX,XX +XXX,XX @@ float64 internal_mpyhh(float64 a, float64 b,
15
float64_is_infinity(b)) {
16
return float64_mul(a, b, fp_status);
17
}
18
- x.mant = int128_mul_6464(accumulated, 1);
19
+ x.mant = int128_make64(accumulated);
20
x.sticky = sticky;
21
prod = fGETUWORD(1, float64_getmant(a)) * fGETUWORD(1, float64_getmant(b));
22
x.mant = int128_add(x.mant, int128_mul_6464(prod, 0x100000000ULL));
23
--
24
2.43.0
1
The normal movi opcodes are going away. We need something
1
Convert all targets simultaneously, as the gen_intermediate_code
2
for TCI to use internally.
2
function disappears from the target. While there are possible
3
workarounds, they're larger than simply performing the conversion.
3
4
4
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
5
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
6
---
7
---
7
include/tcg/tcg-opc.h | 8 ++++++++
8
include/exec/translator.h | 14 --------------
8
tcg/tci.c | 4 ++--
9
include/hw/core/tcg-cpu-ops.h | 13 +++++++++++++
9
tcg/tci/tcg-target.c.inc | 4 ++--
10
target/alpha/cpu.h | 2 ++
10
3 files changed, 12 insertions(+), 4 deletions(-)
11
target/arm/internals.h | 2 ++
12
target/avr/cpu.h | 2 ++
13
target/hexagon/cpu.h | 2 ++
14
target/hppa/cpu.h | 2 ++
15
target/i386/tcg/helper-tcg.h | 2 ++
16
target/loongarch/internals.h | 2 ++
17
target/m68k/cpu.h | 2 ++
18
target/microblaze/cpu.h | 2 ++
19
target/mips/tcg/tcg-internal.h | 2 ++
20
target/openrisc/cpu.h | 2 ++
21
target/ppc/cpu.h | 2 ++
22
target/riscv/cpu.h | 3 +++
23
target/rx/cpu.h | 2 ++
24
target/s390x/s390x-internal.h | 2 ++
25
target/sh4/cpu.h | 2 ++
26
target/sparc/cpu.h | 2 ++
27
target/tricore/cpu.h | 2 ++
28
target/xtensa/cpu.h | 2 ++
29
accel/tcg/cpu-exec.c | 8 +++++---
30
accel/tcg/translate-all.c | 8 +++++---
31
target/alpha/cpu.c | 1 +
32
target/alpha/translate.c | 4 ++--
33
target/arm/cpu.c | 1 +
34
target/arm/tcg/cpu-v7m.c | 1 +
35
target/arm/tcg/translate.c | 5 ++---
36
target/avr/cpu.c | 1 +
37
target/avr/translate.c | 6 +++---
38
target/hexagon/cpu.c | 1 +
39
target/hexagon/translate.c | 4 ++--
40
target/hppa/cpu.c | 1 +
41
target/hppa/translate.c | 4 ++--
42
target/i386/tcg/tcg-cpu.c | 1 +
43
target/i386/tcg/translate.c | 5 ++---
44
target/loongarch/cpu.c | 1 +
45
target/loongarch/tcg/translate.c | 4 ++--
46
target/m68k/cpu.c | 1 +
47
target/m68k/translate.c | 4 ++--
48
target/microblaze/cpu.c | 1 +
49
target/microblaze/translate.c | 4 ++--
50
target/mips/cpu.c | 1 +
51
target/mips/tcg/translate.c | 4 ++--
52
target/openrisc/cpu.c | 1 +
53
target/openrisc/translate.c | 4 ++--
54
target/ppc/cpu_init.c | 1 +
55
target/ppc/translate.c | 4 ++--
56
target/riscv/tcg/tcg-cpu.c | 1 +
57
target/riscv/translate.c | 4 ++--
58
target/rx/cpu.c | 1 +
59
target/rx/translate.c | 4 ++--
60
target/s390x/cpu.c | 1 +
61
target/s390x/tcg/translate.c | 4 ++--
62
target/sh4/cpu.c | 1 +
63
target/sh4/translate.c | 4 ++--
64
target/sparc/cpu.c | 1 +
65
target/sparc/translate.c | 4 ++--
66
target/tricore/cpu.c | 1 +
67
target/tricore/translate.c | 5 ++---
68
target/xtensa/cpu.c | 1 +
69
target/xtensa/translate.c | 4 ++--
70
62 files changed, 121 insertions(+), 62 deletions(-)
11
71
12
diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
72
diff --git a/include/exec/translator.h b/include/exec/translator.h
13
index XXXXXXX..XXXXXXX 100644
73
index XXXXXXX..XXXXXXX 100644
14
--- a/include/tcg/tcg-opc.h
74
--- a/include/exec/translator.h
15
+++ b/include/tcg/tcg-opc.h
75
+++ b/include/exec/translator.h
16
@@ -XXX,XX +XXX,XX @@ DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
76
@@ -XXX,XX +XXX,XX @@
17
#include "tcg-target.opc.h"
77
#include "qemu/bswap.h"
78
#include "exec/vaddr.h"
79
80
-/**
81
- * gen_intermediate_code
82
- * @cpu: cpu context
83
- * @tb: translation block
84
- * @max_insns: max number of instructions to translate
85
- * @pc: guest virtual program counter address
86
- * @host_pc: host physical program counter address
87
- *
88
- * This function must be provided by the target, which should create
89
- * the target-specific DisasContext, and then invoke translator_loop.
90
- */
91
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
92
- vaddr pc, void *host_pc);
93
-
94
/**
95
* DisasJumpType:
96
* @DISAS_NEXT: Next instruction in program order.
97
diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
98
index XXXXXXX..XXXXXXX 100644
99
--- a/include/hw/core/tcg-cpu-ops.h
100
+++ b/include/hw/core/tcg-cpu-ops.h
101
@@ -XXX,XX +XXX,XX @@ struct TCGCPUOps {
102
* Called when the first CPU is realized.
103
*/
104
void (*initialize)(void);
105
+ /**
106
+ * @translate_code: Translate guest instructions to TCGOps
107
+ * @cpu: cpu context
108
+ * @tb: translation block
109
+ * @max_insns: max number of instructions to translate
110
+ * @pc: guest virtual program counter address
111
+ * @host_pc: host physical program counter address
112
+ *
113
+ * This function must be provided by the target, which should create
114
+ * the target-specific DisasContext, and then invoke translator_loop.
115
+ */
116
+ void (*translate_code)(CPUState *cpu, TranslationBlock *tb,
117
+ int *max_insns, vaddr pc, void *host_pc);
118
/**
119
* @synchronize_from_tb: Synchronize state from a TCG #TranslationBlock
120
*
121
diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
122
index XXXXXXX..XXXXXXX 100644
123
--- a/target/alpha/cpu.h
124
+++ b/target/alpha/cpu.h
125
@@ -XXX,XX +XXX,XX @@ enum {
126
};
127
128
void alpha_translate_init(void);
129
+void alpha_translate_code(CPUState *cs, TranslationBlock *tb,
130
+ int *max_insns, vaddr pc, void *host_pc);
131
132
#define CPU_RESOLVING_TYPE TYPE_ALPHA_CPU
133
134
diff --git a/target/arm/internals.h b/target/arm/internals.h
135
index XXXXXXX..XXXXXXX 100644
136
--- a/target/arm/internals.h
137
+++ b/target/arm/internals.h
138
@@ -XXX,XX +XXX,XX @@ void init_cpreg_list(ARMCPU *cpu);
139
140
void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu);
141
void arm_translate_init(void);
142
+void arm_translate_code(CPUState *cs, TranslationBlock *tb,
143
+ int *max_insns, vaddr pc, void *host_pc);
144
145
void arm_cpu_register_gdb_commands(ARMCPU *cpu);
146
void aarch64_cpu_register_gdb_commands(ARMCPU *cpu, GString *,
147
diff --git a/target/avr/cpu.h b/target/avr/cpu.h
148
index XXXXXXX..XXXXXXX 100644
149
--- a/target/avr/cpu.h
150
+++ b/target/avr/cpu.h
151
@@ -XXX,XX +XXX,XX @@ static inline void set_avr_feature(CPUAVRState *env, int feature)
152
}
153
154
void avr_cpu_tcg_init(void);
155
+void avr_cpu_translate_code(CPUState *cs, TranslationBlock *tb,
156
+ int *max_insns, vaddr pc, void *host_pc);
157
158
int cpu_avr_exec(CPUState *cpu);
159
160
diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h
161
index XXXXXXX..XXXXXXX 100644
162
--- a/target/hexagon/cpu.h
163
+++ b/target/hexagon/cpu.h
164
@@ -XXX,XX +XXX,XX @@ static inline void cpu_get_tb_cpu_state(CPUHexagonState *env, vaddr *pc,
165
typedef HexagonCPU ArchCPU;
166
167
void hexagon_translate_init(void);
168
+void hexagon_translate_code(CPUState *cs, TranslationBlock *tb,
169
+ int *max_insns, vaddr pc, void *host_pc);
170
171
#include "exec/cpu-all.h"
172
173
diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
174
index XXXXXXX..XXXXXXX 100644
175
--- a/target/hppa/cpu.h
176
+++ b/target/hppa/cpu.h
177
@@ -XXX,XX +XXX,XX @@ static inline int HPPA_BTLB_ENTRIES(CPUHPPAState *env)
178
}
179
180
void hppa_translate_init(void);
181
+void hppa_translate_code(CPUState *cs, TranslationBlock *tb,
182
+ int *max_insns, vaddr pc, void *host_pc);
183
184
#define CPU_RESOLVING_TYPE TYPE_HPPA_CPU
185
186
diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
187
index XXXXXXX..XXXXXXX 100644
188
--- a/target/i386/tcg/helper-tcg.h
189
+++ b/target/i386/tcg/helper-tcg.h
190
@@ -XXX,XX +XXX,XX @@ static inline target_long lshift(target_long x, int n)
191
192
/* translate.c */
193
void tcg_x86_init(void);
194
+void x86_translate_code(CPUState *cs, TranslationBlock *tb,
195
+ int *max_insns, vaddr pc, void *host_pc);
196
197
/* excp_helper.c */
198
G_NORETURN void raise_exception(CPUX86State *env, int exception_index);
199
diff --git a/target/loongarch/internals.h b/target/loongarch/internals.h
200
index XXXXXXX..XXXXXXX 100644
201
--- a/target/loongarch/internals.h
202
+++ b/target/loongarch/internals.h
203
@@ -XXX,XX +XXX,XX @@
204
#define TARGET_VIRT_MASK MAKE_64BIT_MASK(0, TARGET_VIRT_ADDR_SPACE_BITS)
205
206
void loongarch_translate_init(void);
207
+void loongarch_translate_code(CPUState *cs, TranslationBlock *tb,
208
+ int *max_insns, vaddr pc, void *host_pc);
209
210
void G_NORETURN do_raise_exception(CPULoongArchState *env,
211
uint32_t exception,
212
diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
213
index XXXXXXX..XXXXXXX 100644
214
--- a/target/m68k/cpu.h
215
+++ b/target/m68k/cpu.h
216
@@ -XXX,XX +XXX,XX @@ int m68k_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
217
int m68k_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
218
219
void m68k_tcg_init(void);
220
+void m68k_translate_code(CPUState *cs, TranslationBlock *tb,
221
+ int *max_insns, vaddr pc, void *host_pc);
222
void m68k_cpu_init_gdb(M68kCPU *cpu);
223
uint32_t cpu_m68k_get_ccr(CPUM68KState *env);
224
void cpu_m68k_set_ccr(CPUM68KState *env, uint32_t);
225
diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h
226
index XXXXXXX..XXXXXXX 100644
227
--- a/target/microblaze/cpu.h
228
+++ b/target/microblaze/cpu.h
229
@@ -XXX,XX +XXX,XX @@ static inline void mb_cpu_write_msr(CPUMBState *env, uint32_t val)
230
}
231
232
void mb_tcg_init(void);
233
+void mb_translate_code(CPUState *cs, TranslationBlock *tb,
234
+ int *max_insns, vaddr pc, void *host_pc);
235
236
#define CPU_RESOLVING_TYPE TYPE_MICROBLAZE_CPU
237
238
diff --git a/target/mips/tcg/tcg-internal.h b/target/mips/tcg/tcg-internal.h
239
index XXXXXXX..XXXXXXX 100644
240
--- a/target/mips/tcg/tcg-internal.h
241
+++ b/target/mips/tcg/tcg-internal.h
242
@@ -XXX,XX +XXX,XX @@
243
#include "cpu.h"
244
245
void mips_tcg_init(void);
246
+void mips_translate_code(CPUState *cs, TranslationBlock *tb,
247
+ int *max_insns, vaddr pc, void *host_pc);
248
249
void mips_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb);
250
G_NORETURN void mips_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
251
diff --git a/target/openrisc/cpu.h b/target/openrisc/cpu.h
252
index XXXXXXX..XXXXXXX 100644
253
--- a/target/openrisc/cpu.h
254
+++ b/target/openrisc/cpu.h
255
@@ -XXX,XX +XXX,XX @@ void openrisc_cpu_dump_state(CPUState *cpu, FILE *f, int flags);
256
int openrisc_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
257
int openrisc_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
258
void openrisc_translate_init(void);
259
+void openrisc_translate_code(CPUState *cs, TranslationBlock *tb,
260
+ int *max_insns, vaddr pc, void *host_pc);
261
int print_insn_or1k(bfd_vma addr, disassemble_info *info);
262
263
#ifndef CONFIG_USER_ONLY
264
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
265
index XXXXXXX..XXXXXXX 100644
266
--- a/target/ppc/cpu.h
267
+++ b/target/ppc/cpu.h
268
@@ -XXX,XX +XXX,XX @@ extern const VMStateDescription vmstate_ppc_cpu;
269
270
/*****************************************************************************/
271
void ppc_translate_init(void);
272
+void ppc_translate_code(CPUState *cs, TranslationBlock *tb,
273
+ int *max_insns, vaddr pc, void *host_pc);
274
275
#if !defined(CONFIG_USER_ONLY)
276
void ppc_store_sdr1(CPUPPCState *env, target_ulong value);
277
diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
278
index XXXXXXX..XXXXXXX 100644
279
--- a/target/riscv/cpu.h
280
+++ b/target/riscv/cpu.h
281
@@ -XXX,XX +XXX,XX @@ RISCVException smstateen_acc_ok(CPURISCVState *env, int index, uint64_t bit);
282
void riscv_cpu_set_mode(CPURISCVState *env, target_ulong newpriv, bool virt_en);
283
284
void riscv_translate_init(void);
285
+void riscv_translate_code(CPUState *cs, TranslationBlock *tb,
286
+ int *max_insns, vaddr pc, void *host_pc);
287
+
288
G_NORETURN void riscv_raise_exception(CPURISCVState *env,
289
uint32_t exception, uintptr_t pc);
290
291
diff --git a/target/rx/cpu.h b/target/rx/cpu.h
292
index XXXXXXX..XXXXXXX 100644
293
--- a/target/rx/cpu.h
294
+++ b/target/rx/cpu.h
295
@@ -XXX,XX +XXX,XX @@ int rx_cpu_gdb_read_register(CPUState *cpu, GByteArray *buf, int reg);
296
int rx_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
297
298
void rx_translate_init(void);
299
+void rx_translate_code(CPUState *cs, TranslationBlock *tb,
300
+ int *max_insns, vaddr pc, void *host_pc);
301
void rx_cpu_unpack_psw(CPURXState *env, uint32_t psw, int rte);
302
303
#include "exec/cpu-all.h"
304
diff --git a/target/s390x/s390x-internal.h b/target/s390x/s390x-internal.h
305
index XXXXXXX..XXXXXXX 100644
306
--- a/target/s390x/s390x-internal.h
307
+++ b/target/s390x/s390x-internal.h
308
@@ -XXX,XX +XXX,XX @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, uint64_t r3,
309
310
/* translate.c */
311
void s390x_translate_init(void);
312
+void s390x_translate_code(CPUState *cs, TranslationBlock *tb,
313
+ int *max_insns, vaddr pc, void *host_pc);
314
void s390x_restore_state_to_opc(CPUState *cs,
315
const TranslationBlock *tb,
316
const uint64_t *data);
317
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
318
index XXXXXXX..XXXXXXX 100644
319
--- a/target/sh4/cpu.h
320
+++ b/target/sh4/cpu.h
321
@@ -XXX,XX +XXX,XX @@ G_NORETURN void superh_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
322
uintptr_t retaddr);
323
324
void sh4_translate_init(void);
325
+void sh4_translate_code(CPUState *cs, TranslationBlock *tb,
326
+ int *max_insns, vaddr pc, void *host_pc);
327
328
#if !defined(CONFIG_USER_ONLY)
329
hwaddr superh_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr);
330
diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
331
index XXXXXXX..XXXXXXX 100644
332
--- a/target/sparc/cpu.h
333
+++ b/target/sparc/cpu.h
334
@@ -XXX,XX +XXX,XX @@ int sparc_cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
335
336
/* translate.c */
337
void sparc_tcg_init(void);
338
+void sparc_translate_code(CPUState *cs, TranslationBlock *tb,
339
+ int *max_insns, vaddr pc, void *host_pc);
340
341
/* fop_helper.c */
342
target_ulong cpu_get_fsr(CPUSPARCState *);
343
diff --git a/target/tricore/cpu.h b/target/tricore/cpu.h
344
index XXXXXXX..XXXXXXX 100644
345
--- a/target/tricore/cpu.h
346
+++ b/target/tricore/cpu.h
347
@@ -XXX,XX +XXX,XX @@ FIELD(TB_FLAGS, PRIV, 0, 2)
348
349
void cpu_state_reset(CPUTriCoreState *s);
350
void tricore_tcg_init(void);
351
+void tricore_translate_code(CPUState *cs, TranslationBlock *tb,
352
+ int *max_insns, vaddr pc, void *host_pc);
353
354
static inline void cpu_get_tb_cpu_state(CPUTriCoreState *env, vaddr *pc,
355
uint64_t *cs_base, uint32_t *flags)
356
diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
357
index XXXXXXX..XXXXXXX 100644
358
--- a/target/xtensa/cpu.h
359
+++ b/target/xtensa/cpu.h
360
@@ -XXX,XX +XXX,XX @@ G_NORETURN void xtensa_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
361
362
void xtensa_collect_sr_names(const XtensaConfig *config);
363
void xtensa_translate_init(void);
364
+void xtensa_translate_code(CPUState *cs, TranslationBlock *tb,
365
+ int *max_insns, vaddr pc, void *host_pc);
366
void **xtensa_get_regfile_by_name(const char *name, int entries, int bits);
367
void xtensa_breakpoint_handler(CPUState *cs);
368
void xtensa_register_core(XtensaConfigList *node);
369
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
370
index XXXXXXX..XXXXXXX 100644
371
--- a/accel/tcg/cpu-exec.c
372
+++ b/accel/tcg/cpu-exec.c
373
@@ -XXX,XX +XXX,XX @@ bool tcg_exec_realizefn(CPUState *cpu, Error **errp)
374
375
if (!tcg_target_initialized) {
376
/* Check mandatory TCGCPUOps handlers */
377
+ const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops;
378
#ifndef CONFIG_USER_ONLY
379
- assert(cpu->cc->tcg_ops->cpu_exec_halt);
380
- assert(cpu->cc->tcg_ops->cpu_exec_interrupt);
381
+ assert(tcg_ops->cpu_exec_halt);
382
+ assert(tcg_ops->cpu_exec_interrupt);
383
#endif /* !CONFIG_USER_ONLY */
384
- cpu->cc->tcg_ops->initialize();
385
+ assert(tcg_ops->translate_code);
386
+ tcg_ops->initialize();
387
tcg_target_initialized = true;
388
}
389
390
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
391
index XXXXXXX..XXXXXXX 100644
392
--- a/accel/tcg/translate-all.c
393
+++ b/accel/tcg/translate-all.c
394
@@ -XXX,XX +XXX,XX @@ static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
395
396
tcg_func_start(tcg_ctx);
397
398
- tcg_ctx->cpu = env_cpu(env);
399
- gen_intermediate_code(env_cpu(env), tb, max_insns, pc, host_pc);
400
+ CPUState *cs = env_cpu(env);
401
+ tcg_ctx->cpu = cs;
402
+ cs->cc->tcg_ops->translate_code(cs, tb, max_insns, pc, host_pc);
403
+
404
assert(tb->size != 0);
405
tcg_ctx->cpu = NULL;
406
*max_insns = tb->icount;
407
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
408
/*
409
* Overflow of code_gen_buffer, or the current slice of it.
410
*
411
- * TODO: We don't need to re-do gen_intermediate_code, nor
412
+ * TODO: We don't need to re-do tcg_ops->translate_code, nor
413
* should we re-do the tcg optimization currently hidden
414
* inside tcg_gen_code. All that should be required is to
415
* flush the TBs, allocate a new TB, re-initialize it per
416
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
417
index XXXXXXX..XXXXXXX 100644
418
--- a/target/alpha/cpu.c
419
+++ b/target/alpha/cpu.c
420
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps alpha_sysemu_ops = {
421
422
static const TCGCPUOps alpha_tcg_ops = {
423
.initialize = alpha_translate_init,
424
+ .translate_code = alpha_translate_code,
425
.synchronize_from_tb = alpha_cpu_synchronize_from_tb,
426
.restore_state_to_opc = alpha_restore_state_to_opc,
427
428
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
429
index XXXXXXX..XXXXXXX 100644
430
--- a/target/alpha/translate.c
431
+++ b/target/alpha/translate.c
432
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps alpha_tr_ops = {
433
.tb_stop = alpha_tr_tb_stop,
434
};
435
436
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
437
- vaddr pc, void *host_pc)
438
+void alpha_translate_code(CPUState *cpu, TranslationBlock *tb,
439
+ int *max_insns, vaddr pc, void *host_pc)
440
{
441
DisasContext dc;
442
translator_loop(cpu, tb, max_insns, pc, host_pc, &alpha_tr_ops, &dc.base);
443
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
444
index XXXXXXX..XXXXXXX 100644
445
--- a/target/arm/cpu.c
446
+++ b/target/arm/cpu.c
447
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps arm_sysemu_ops = {
448
#ifdef CONFIG_TCG
449
static const TCGCPUOps arm_tcg_ops = {
450
.initialize = arm_translate_init,
451
+ .translate_code = arm_translate_code,
452
.synchronize_from_tb = arm_cpu_synchronize_from_tb,
453
.debug_excp_handler = arm_debug_excp_handler,
454
.restore_state_to_opc = arm_restore_state_to_opc,
455
diff --git a/target/arm/tcg/cpu-v7m.c b/target/arm/tcg/cpu-v7m.c
456
index XXXXXXX..XXXXXXX 100644
457
--- a/target/arm/tcg/cpu-v7m.c
458
+++ b/target/arm/tcg/cpu-v7m.c
459
@@ -XXX,XX +XXX,XX @@ static void cortex_m55_initfn(Object *obj)
460
461
static const TCGCPUOps arm_v7m_tcg_ops = {
462
.initialize = arm_translate_init,
463
+ .translate_code = arm_translate_code,
464
.synchronize_from_tb = arm_cpu_synchronize_from_tb,
465
.debug_excp_handler = arm_debug_excp_handler,
466
.restore_state_to_opc = arm_restore_state_to_opc,
467
diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c
468
index XXXXXXX..XXXXXXX 100644
469
--- a/target/arm/tcg/translate.c
470
+++ b/target/arm/tcg/translate.c
471
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps thumb_translator_ops = {
472
.tb_stop = arm_tr_tb_stop,
473
};
474
475
-/* generate intermediate code for basic block 'tb'. */
476
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
477
- vaddr pc, void *host_pc)
478
+void arm_translate_code(CPUState *cpu, TranslationBlock *tb,
479
+ int *max_insns, vaddr pc, void *host_pc)
480
{
481
DisasContext dc = { };
482
const TranslatorOps *ops = &arm_translator_ops;
483
diff --git a/target/avr/cpu.c b/target/avr/cpu.c
484
index XXXXXXX..XXXXXXX 100644
485
--- a/target/avr/cpu.c
486
+++ b/target/avr/cpu.c
487
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps avr_sysemu_ops = {
488
489
static const TCGCPUOps avr_tcg_ops = {
490
.initialize = avr_cpu_tcg_init,
491
+ .translate_code = avr_cpu_translate_code,
492
.synchronize_from_tb = avr_cpu_synchronize_from_tb,
493
.restore_state_to_opc = avr_restore_state_to_opc,
494
.cpu_exec_interrupt = avr_cpu_exec_interrupt,
495
diff --git a/target/avr/translate.c b/target/avr/translate.c
496
index XXXXXXX..XXXXXXX 100644
497
--- a/target/avr/translate.c
498
+++ b/target/avr/translate.c
499
@@ -XXX,XX +XXX,XX @@ static bool trans_WDR(DisasContext *ctx, arg_WDR *a)
500
*
501
* - translate()
502
* - canonicalize_skip()
503
- * - gen_intermediate_code()
504
+ * - translate_code()
505
* - restore_state_to_opc()
506
*
507
*/
508
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps avr_tr_ops = {
509
.tb_stop = avr_tr_tb_stop,
510
};
511
512
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
513
- vaddr pc, void *host_pc)
514
+void avr_cpu_translate_code(CPUState *cs, TranslationBlock *tb,
515
+ int *max_insns, vaddr pc, void *host_pc)
516
{
517
DisasContext dc = { };
518
translator_loop(cs, tb, max_insns, pc, host_pc, &avr_tr_ops, &dc.base);
519
diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
520
index XXXXXXX..XXXXXXX 100644
521
--- a/target/hexagon/cpu.c
522
+++ b/target/hexagon/cpu.c
523
@@ -XXX,XX +XXX,XX @@ static void hexagon_cpu_init(Object *obj)
524
525
static const TCGCPUOps hexagon_tcg_ops = {
526
.initialize = hexagon_translate_init,
527
+ .translate_code = hexagon_translate_code,
528
.synchronize_from_tb = hexagon_cpu_synchronize_from_tb,
529
.restore_state_to_opc = hexagon_restore_state_to_opc,
530
};
531
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
532
index XXXXXXX..XXXXXXX 100644
533
--- a/target/hexagon/translate.c
534
+++ b/target/hexagon/translate.c
535
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps hexagon_tr_ops = {
536
.tb_stop = hexagon_tr_tb_stop,
537
};
538
539
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
540
- vaddr pc, void *host_pc)
541
+void hexagon_translate_code(CPUState *cs, TranslationBlock *tb,
542
+ int *max_insns, vaddr pc, void *host_pc)
543
{
544
DisasContext ctx;
545
546
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
547
index XXXXXXX..XXXXXXX 100644
548
--- a/target/hppa/cpu.c
549
+++ b/target/hppa/cpu.c
550
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps hppa_sysemu_ops = {
551
552
static const TCGCPUOps hppa_tcg_ops = {
553
.initialize = hppa_translate_init,
554
+ .translate_code = hppa_translate_code,
555
.synchronize_from_tb = hppa_cpu_synchronize_from_tb,
556
.restore_state_to_opc = hppa_restore_state_to_opc,
557
558
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
559
index XXXXXXX..XXXXXXX 100644
560
--- a/target/hppa/translate.c
561
+++ b/target/hppa/translate.c
562
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps hppa_tr_ops = {
18
#endif
563
#endif
19
564
};
20
+#ifdef TCG_TARGET_INTERPRETER
565
21
+/* These opcodes are only for use between the tci generator and interpreter. */
566
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
22
+DEF(tci_movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
567
- vaddr pc, void *host_pc)
23
+#if TCG_TARGET_REG_BITS == 64
568
+void hppa_translate_code(CPUState *cs, TranslationBlock *tb,
24
+DEF(tci_movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
569
+ int *max_insns, vaddr pc, void *host_pc)
25
+#endif
570
{
26
+#endif
571
DisasContext ctx = { };
27
+
572
translator_loop(cs, tb, max_insns, pc, host_pc, &hppa_tr_ops, &ctx.base);
28
#undef TLADDR_ARGS
573
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
29
#undef DATA64_ARGS
574
index XXXXXXX..XXXXXXX 100644
30
#undef IMPL
575
--- a/target/i386/tcg/tcg-cpu.c
31
diff --git a/tcg/tci.c b/tcg/tci.c
576
+++ b/target/i386/tcg/tcg-cpu.c
32
index XXXXXXX..XXXXXXX 100644
577
@@ -XXX,XX +XXX,XX @@ static bool x86_debug_check_breakpoint(CPUState *cs)
33
--- a/tcg/tci.c
578
34
+++ b/tcg/tci.c
579
static const TCGCPUOps x86_tcg_ops = {
35
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
580
.initialize = tcg_x86_init,
36
t1 = tci_read_r32(regs, &tb_ptr);
581
+ .translate_code = x86_translate_code,
37
tci_write_reg32(regs, t0, t1);
582
.synchronize_from_tb = x86_cpu_synchronize_from_tb,
38
break;
583
.restore_state_to_opc = x86_restore_state_to_opc,
39
- case INDEX_op_movi_i32:
584
.cpu_exec_enter = x86_cpu_exec_enter,
40
+ case INDEX_op_tci_movi_i32:
585
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
41
t0 = *tb_ptr++;
586
index XXXXXXX..XXXXXXX 100644
42
t1 = tci_read_i32(&tb_ptr);
587
--- a/target/i386/tcg/translate.c
43
tci_write_reg32(regs, t0, t1);
588
+++ b/target/i386/tcg/translate.c
44
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
589
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps i386_tr_ops = {
45
t1 = tci_read_r64(regs, &tb_ptr);
590
.tb_stop = i386_tr_tb_stop,
46
tci_write_reg64(regs, t0, t1);
591
};
47
break;
592
48
- case INDEX_op_movi_i64:
593
-/* generate intermediate code for basic block 'tb'. */
49
+ case INDEX_op_tci_movi_i64:
594
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
50
t0 = *tb_ptr++;
595
- vaddr pc, void *host_pc)
51
t1 = tci_read_i64(&tb_ptr);
596
+void x86_translate_code(CPUState *cpu, TranslationBlock *tb,
52
tci_write_reg64(regs, t0, t1);
597
+ int *max_insns, vaddr pc, void *host_pc)
53
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
598
{
54
index XXXXXXX..XXXXXXX 100644
599
DisasContext dc;
55
--- a/tcg/tci/tcg-target.c.inc
600
56
+++ b/tcg/tci/tcg-target.c.inc
601
diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c
57
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type,
602
index XXXXXXX..XXXXXXX 100644
58
uint8_t *old_code_ptr = s->code_ptr;
603
--- a/target/loongarch/cpu.c
59
uint32_t arg32 = arg;
604
+++ b/target/loongarch/cpu.c
60
if (type == TCG_TYPE_I32 || arg == arg32) {
605
@@ -XXX,XX +XXX,XX @@ static void loongarch_cpu_dump_state(CPUState *cs, FILE *f, int flags)
61
- tcg_out_op_t(s, INDEX_op_movi_i32);
606
62
+ tcg_out_op_t(s, INDEX_op_tci_movi_i32);
607
static const TCGCPUOps loongarch_tcg_ops = {
63
tcg_out_r(s, t0);
608
.initialize = loongarch_translate_init,
64
tcg_out32(s, arg32);
609
+ .translate_code = loongarch_translate_code,
65
} else {
610
.synchronize_from_tb = loongarch_cpu_synchronize_from_tb,
66
tcg_debug_assert(type == TCG_TYPE_I64);
611
.restore_state_to_opc = loongarch_restore_state_to_opc,
67
#if TCG_TARGET_REG_BITS == 64
612
68
- tcg_out_op_t(s, INDEX_op_movi_i64);
613
diff --git a/target/loongarch/tcg/translate.c b/target/loongarch/tcg/translate.c
69
+ tcg_out_op_t(s, INDEX_op_tci_movi_i64);
614
index XXXXXXX..XXXXXXX 100644
70
tcg_out_r(s, t0);
615
--- a/target/loongarch/tcg/translate.c
71
tcg_out64(s, arg);
616
+++ b/target/loongarch/tcg/translate.c
72
#else
617
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps loongarch_tr_ops = {
618
.tb_stop = loongarch_tr_tb_stop,
619
};
620
621
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
622
- vaddr pc, void *host_pc)
623
+void loongarch_translate_code(CPUState *cs, TranslationBlock *tb,
624
+ int *max_insns, vaddr pc, void *host_pc)
625
{
626
DisasContext ctx;
627
628
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
629
index XXXXXXX..XXXXXXX 100644
630
--- a/target/m68k/cpu.c
631
+++ b/target/m68k/cpu.c
632
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps m68k_sysemu_ops = {
633
634
static const TCGCPUOps m68k_tcg_ops = {
635
.initialize = m68k_tcg_init,
636
+ .translate_code = m68k_translate_code,
637
.restore_state_to_opc = m68k_restore_state_to_opc,
638
639
#ifndef CONFIG_USER_ONLY
640
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
641
index XXXXXXX..XXXXXXX 100644
642
--- a/target/m68k/translate.c
643
+++ b/target/m68k/translate.c
644
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps m68k_tr_ops = {
645
.tb_stop = m68k_tr_tb_stop,
646
};
647
648
-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
649
- vaddr pc, void *host_pc)
650
+void m68k_translate_code(CPUState *cpu, TranslationBlock *tb,
651
+ int *max_insns, vaddr pc, void *host_pc)
652
{
653
DisasContext dc;
654
translator_loop(cpu, tb, max_insns, pc, host_pc, &m68k_tr_ops, &dc.base);
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps mb_sysemu_ops = {

static const TCGCPUOps mb_tcg_ops = {
.initialize = mb_tcg_init,
+ .translate_code = mb_translate_code,
.synchronize_from_tb = mb_cpu_synchronize_from_tb,
.restore_state_to_opc = mb_restore_state_to_opc,

diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps mb_tr_ops = {
.tb_stop = mb_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void mb_translate_code(CPUState *cpu, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext dc;
translator_loop(cpu, tb, max_insns, pc, host_pc, &mb_tr_ops, &dc.base);
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -XXX,XX +XXX,XX @@ static const Property mips_cpu_properties[] = {
#include "hw/core/tcg-cpu-ops.h"
static const TCGCPUOps mips_tcg_ops = {
.initialize = mips_tcg_init,
+ .translate_code = mips_translate_code,
.synchronize_from_tb = mips_cpu_synchronize_from_tb,
.restore_state_to_opc = mips_restore_state_to_opc,

diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps mips_tr_ops = {
.tb_stop = mips_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void mips_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext ctx;

diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/cpu.c
+++ b/target/openrisc/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps openrisc_sysemu_ops = {

static const TCGCPUOps openrisc_tcg_ops = {
.initialize = openrisc_translate_init,
+ .translate_code = openrisc_translate_code,
.synchronize_from_tb = openrisc_cpu_synchronize_from_tb,
.restore_state_to_opc = openrisc_restore_state_to_opc,

diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/translate.c
+++ b/target/openrisc/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps openrisc_tr_ops = {
.tb_stop = openrisc_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void openrisc_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext ctx;

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps ppc_sysemu_ops = {

static const TCGCPUOps ppc_tcg_ops = {
.initialize = ppc_translate_init,
+ .translate_code = ppc_translate_code,
.restore_state_to_opc = ppc_restore_state_to_opc,

#ifdef CONFIG_USER_ONLY
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps ppc_tr_ops = {
.tb_stop = ppc_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void ppc_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext ctx;

diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/tcg/tcg-cpu.c
+++ b/target/riscv/tcg/tcg-cpu.c
@@ -XXX,XX +XXX,XX @@ static void riscv_restore_state_to_opc(CPUState *cs,

static const TCGCPUOps riscv_tcg_ops = {
.initialize = riscv_translate_init,
+ .translate_code = riscv_translate_code,
.synchronize_from_tb = riscv_cpu_synchronize_from_tb,
.restore_state_to_opc = riscv_restore_state_to_opc,

diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps riscv_tr_ops = {
.tb_stop = riscv_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void riscv_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext ctx;

diff --git a/target/rx/cpu.c b/target/rx/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/cpu.c
+++ b/target/rx/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps rx_sysemu_ops = {

static const TCGCPUOps rx_tcg_ops = {
.initialize = rx_translate_init,
+ .translate_code = rx_translate_code,
.synchronize_from_tb = rx_cpu_synchronize_from_tb,
.restore_state_to_opc = rx_restore_state_to_opc,
.tlb_fill = rx_cpu_tlb_fill,
diff --git a/target/rx/translate.c b/target/rx/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/translate.c
+++ b/target/rx/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps rx_tr_ops = {
.tb_stop = rx_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void rx_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext dc;

diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -XXX,XX +XXX,XX @@ void cpu_get_tb_cpu_state(CPUS390XState *env, vaddr *pc,

static const TCGCPUOps s390_tcg_ops = {
.initialize = s390x_translate_init,
+ .translate_code = s390x_translate_code,
.restore_state_to_opc = s390x_restore_state_to_opc,

#ifdef CONFIG_USER_ONLY
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps s390x_tr_ops = {
.disas_log = s390x_tr_disas_log,
};

-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void s390x_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext dc;

diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps sh4_sysemu_ops = {

static const TCGCPUOps superh_tcg_ops = {
.initialize = sh4_translate_init,
+ .translate_code = sh4_translate_code,
.synchronize_from_tb = superh_cpu_synchronize_from_tb,
.restore_state_to_opc = superh_restore_state_to_opc,

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps sh4_tr_ops = {
.tb_stop = sh4_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void sh4_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext ctx;

diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/cpu.c
+++ b/target/sparc/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps sparc_sysemu_ops = {

static const TCGCPUOps sparc_tcg_ops = {
.initialize = sparc_tcg_init,
+ .translate_code = sparc_translate_code,
.synchronize_from_tb = sparc_cpu_synchronize_from_tb,
.restore_state_to_opc = sparc_restore_state_to_opc,

diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps sparc_tr_ops = {
.tb_stop = sparc_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void sparc_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext dc = {};

diff --git a/target/tricore/cpu.c b/target/tricore/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/cpu.c
+++ b/target/tricore/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps tricore_sysemu_ops = {

static const TCGCPUOps tricore_tcg_ops = {
.initialize = tricore_tcg_init,
+ .translate_code = tricore_translate_code,
.synchronize_from_tb = tricore_cpu_synchronize_from_tb,
.restore_state_to_opc = tricore_restore_state_to_opc,
.tlb_fill = tricore_cpu_tlb_fill,
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/translate.c
+++ b/target/tricore/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps tricore_tr_ops = {
.tb_stop = tricore_tr_tb_stop,
};

-
-void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void tricore_translate_code(CPUState *cs, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext ctx;
translator_loop(cs, tb, max_insns, pc, host_pc,
diff --git a/target/xtensa/cpu.c b/target/xtensa/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/cpu.c
+++ b/target/xtensa/cpu.c
@@ -XXX,XX +XXX,XX @@ static const struct SysemuCPUOps xtensa_sysemu_ops = {

static const TCGCPUOps xtensa_tcg_ops = {
.initialize = xtensa_translate_init,
+ .translate_code = xtensa_translate_code,
.debug_excp_handler = xtensa_breakpoint_handler,
.restore_state_to_opc = xtensa_restore_state_to_opc,

diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -XXX,XX +XXX,XX @@ static const TranslatorOps xtensa_translator_ops = {
.tb_stop = xtensa_tr_tb_stop,
};

-void gen_intermediate_code(CPUState *cpu, TranslationBlock *tb, int *max_insns,
- vaddr pc, void *host_pc)
+void xtensa_translate_code(CPUState *cpu, TranslationBlock *tb,
+ int *max_insns, vaddr pc, void *host_pc)
{
DisasContext dc = {};
translator_loop(cpu, tb, max_insns, pc, host_pc,
--
2.43.0