The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:

  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027

for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:

  tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)

----------------------------------------------------------------
Improvements to qemu/int128
Fixes for 128/64 division.
Cleanup tcg/optimize.c
Optimize redundant sign extensions

----------------------------------------------------------------
Frédéric Pétrot (1):
      qemu/int128: Add int128_{not,xor}

Luis Pires (4):
      host-utils: move checks out of divu128/divs128
      host-utils: move udiv_qrnnd() to host-utils
      host-utils: add 128-bit quotient support to divu128/divs128
      host-utils: add unit tests for divu128/divs128

Richard Henderson (51):
      tcg/optimize: Rename "mask" to "z_mask"
      tcg/optimize: Split out OptContext
      tcg/optimize: Remove do_default label
      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
      tcg/optimize: Move prev_mb into OptContext
      tcg/optimize: Split out init_arguments
      tcg/optimize: Split out copy_propagate
      tcg/optimize: Split out fold_call
      tcg/optimize: Drop nb_oargs, nb_iargs locals
      tcg/optimize: Change fail return for do_constant_folding_cond*
      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
      tcg/optimize: Split out finish_folding
      tcg/optimize: Use a boolean to avoid a mass of continues
      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
      tcg/optimize: Split out fold_const{1,2}
      tcg/optimize: Split out fold_setcond2
      tcg/optimize: Split out fold_brcond2
      tcg/optimize: Split out fold_brcond
      tcg/optimize: Split out fold_setcond
      tcg/optimize: Split out fold_mulu2_i32
      tcg/optimize: Split out fold_addsub2_i32
      tcg/optimize: Split out fold_movcond
      tcg/optimize: Split out fold_extract2
      tcg/optimize: Split out fold_extract, fold_sextract
      tcg/optimize: Split out fold_deposit
      tcg/optimize: Split out fold_count_zeros
      tcg/optimize: Split out fold_bswap
      tcg/optimize: Split out fold_dup, fold_dup2
      tcg/optimize: Split out fold_mov
      tcg/optimize: Split out fold_xx_to_i
      tcg/optimize: Split out fold_xx_to_x
      tcg/optimize: Split out fold_xi_to_i
      tcg/optimize: Add type to OptContext
      tcg/optimize: Split out fold_to_not
      tcg/optimize: Split out fold_sub_to_neg
      tcg/optimize: Split out fold_xi_to_x
      tcg/optimize: Split out fold_ix_to_i
      tcg/optimize: Split out fold_masks
      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
      tcg/optimize: Sink commutative operand swapping into fold functions
      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
      tcg/optimize: Use fold_xx_to_i for orc
      tcg/optimize: Use fold_xi_to_x for mul
      tcg/optimize: Use fold_xi_to_x for div
      tcg/optimize: Use fold_xx_to_i for rem
      tcg/optimize: Optimize sign extensions
      tcg/optimize: Propagate sign info for logical operations
      tcg/optimize: Propagate sign info for setcond
      tcg/optimize: Propagate sign info for bit counting
      tcg/optimize: Propagate sign info for shifting

 include/fpu/softfloat-macros.h |   82 --
 include/hw/clock.h             |    5 +-
 include/qemu/host-utils.h      |  121 +-
 include/qemu/int128.h          |   20 +
 target/ppc/int_helper.c        |   23 +-
 tcg/optimize.c                 | 2644 ++++++++++++++++++++++++----------------
 tests/unit/test-div128.c       |  197 +++
 util/host-utils.c              |  147 ++-
 tests/unit/meson.build         |    1 +
 9 files changed, 2053 insertions(+), 1187 deletions(-)
 create mode 100644 tests/unit/test-div128.c
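For reference, the tag can be fetched and the series inspected locally with
the usual commands (illustrative; the short hash abbreviates the base commit
above):

    git fetch https://gitlab.com/rth7680/qemu.git tag pull-tcg-20211027
    git log --oneline c52d69e7dbaa..pull-tcg-20211027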
From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>

Addition of not and xor on 128-bit integers.

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
[rth: Split out logical operations.]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/int128.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return a;
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return ~a;
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return a | b;
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return a ^ b;
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return int128_make128(a, (a < 0) ? -1 : 0);
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return int128_make128(~a.lo, ~a.hi);
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return int128_make128(a.lo | b.lo, a.hi | b.hi);
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     int64_t h;
-- 
2.25.1
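To make the equivalence of the two paths concrete, here is a small
standalone sketch (not part of the patch; the struct mirrors the
!CONFIG_INT128 fallback layout above, and a compiler with __int128
support is assumed) showing that complementing each 64-bit limb matches
a full 128-bit complement:

    /* Hypothetical demo program, not QEMU code. */
    #include <assert.h>
    #include <stdint.h>

    typedef struct { uint64_t lo; int64_t hi; } I128;  /* fallback layout */

    static I128 i128_not(I128 a) { return (I128){ ~a.lo, ~a.hi }; }

    int main(void)
    {
        unsigned __int128 x = ((unsigned __int128)0x0123456789abcdefULL << 64)
                              | 0xfedcba9876543210ULL;
        I128 y = { 0xfedcba9876543210ULL, 0x0123456789abcdefLL };

        unsigned __int128 nx = ~x;  /* CONFIG_INT128 path: one native op */
        I128 ny = i128_not(y);      /* fallback path: per-limb complement */

        assert((uint64_t)nx == ny.lo);
        assert((uint64_t)(nx >> 64) == (uint64_t)ny.hi);
        return 0;
    }

The same per-limb argument applies to int128_xor, since xor has no
carries between limbs.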
From: Luis Pires <luis.pires@eldorado.org.br>

In preparation for changing the divu128/divs128 implementations
to allow for quotients larger than 64 bits, move the div-by-zero
and overflow checks to the callers.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |  5 +++--
 include/qemu/host-utils.h | 34 ++++++++++++---------------------
 target/ppc/int_helper.c   | 14 +++++++++-----
 util/host-utils.c         | 40 ++++++++++++++++++--------------------
 4 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
         return 0;
     }
     /*
-     * Ignore divu128() return value as we've caught div-by-zero and don't
-     * need different behaviour for overflow.
+     * BUG: when CONFIG_INT128 is not defined, the current implementation of
+     * divu128 does not return a valid truncated quotient, so the result will
+     * be wrong.
      */
     divu128(&lo, &hi, clk->period);
     return lo;
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
-        __uint128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result > UINT64_MAX;
-    }
+    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
+    __uint128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 
-static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
-        __int128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result != *plow;
-    }
+    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
     uint64_t rt = 0;
     int overflow = 0;
 
-    overflow = divu128(&rt, &ra, rb);
-
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || ra >= rb)) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divu128(&rt, &ra, rb);
     }
 
     if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
     int64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
-    int overflow = divs128(&rt, &ra, rb);
+    int overflow = 0;
 
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divs128(&rt, &ra, rb);
     }
 
     if (oe) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
     *phigh = rh;
 }
 
-/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
-/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
-/* remainder via phigh. */
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+/*
+ * Unsigned 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
     unsigned i;
     uint64_t carry = 0;
 
-    if (divisor == 0) {
-        return 1;
-    } else if (dhi == 0) {
+    if (divisor == 0 || dhi == 0) {
         *plow = dlo / divisor;
         *phigh = dlo % divisor;
-        return 0;
-    } else if (dhi >= divisor) {
-        return 1;
     } else {
 
         for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 
         *plow = dlo;
         *phigh = dhi;
-        return 0;
     }
 }
 
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+/*
+ * Signed 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
     int sgn_dvdnd = *phigh < 0;
     int sgn_divsr = divisor < 0;
-    int overflow = 0;
 
     if (sgn_dvdnd) {
         *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
         divisor = 0 - divisor;
     }
 
-    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
+    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 
     if (sgn_dvdnd ^ sgn_divsr) {
         *plow = 0 - *plow;
     }
-
-    if (!overflow) {
-        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
-            overflow = 1;
-        }
-    }
-
-    return overflow;
 }
 #endif
 
-- 
2.25.1
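As a sketch of the new contract (a hypothetical wrapper, not part of this
series): a caller that still wants the old guarded behaviour now performs
the checks itself before calling the unchecked primitive, exactly as
helper_divdeu() does above:

    /*
     * Hypothetical caller-side guard; divu128() here is the void version
     * introduced by this patch and must not see divisor == 0.
     */
    static bool checked_divu128(uint64_t *plow, uint64_t *phigh,
                                uint64_t divisor)
    {
        if (divisor == 0 || *phigh >= divisor) {
            return false;  /* div-by-zero, or quotient exceeds 64 bits */
        }
        divu128(plow, phigh, divisor);  /* safe: result fits in *plow */
        return true;
    }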
From: Luis Pires <luis.pires@eldorado.org.br>

Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
so it can be reused by divu128().

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-macros.h | 82 ----------------------------------
 include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
  * so some portions are provided under:
  *   the SoftFloat-2a license
  *   the BSD license
- *   GPL-v2-or-later
  *
  * Any future contributions to this file after December 1st 2014 will be
  * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* Portions of this work are licensed under the terms of the GNU GPL,
- * version 2 or later. See the COPYING file in the top-level directory.
- */
-
 #ifndef FPU_SOFTFLOAT_MACROS_H
 #define FPU_SOFTFLOAT_MACROS_H
 
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 
 }
 
-/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
- * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
- *
- * Licensed under the GPLv2/LGPLv3
- */
-static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
-                                  uint64_t n0, uint64_t d)
-{
-#if defined(__x86_64__)
-    uint64_t q;
-    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
-    return q;
-#elif defined(__s390x__) && !defined(__clang__)
-    /* Need to use a TImode type to get an even register pair for DLGR. */
-    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
-    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
-    *r = n >> 64;
-    return n;
-#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
-    /* From Power ISA 2.06, programming note for divdeu. */
-    uint64_t q1, q2, Q, r1, r2, R;
-    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
-        : "=&r"(q1), "=r"(q2)
-        : "r"(n1), "r"(n0), "r"(d));
-    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
-    r2 = n0 - (q2 * d);
-    Q = q1 + q2;
-    R = r1 + r2;
-    if (R >= d || R < r2) { /* overflow implies R > d */
-        Q += 1;
-        R -= d;
-    }
-    *r = R;
-    return Q;
-#else
-    uint64_t d0, d1, q0, q1, r1, r0, m;
-
-    d0 = (uint32_t)d;
-    d1 = d >> 32;
-
-    r1 = n1 % d1;
-    q1 = n1 / d1;
-    m = q1 * d0;
-    r1 = (r1 << 32) | (n0 >> 32);
-    if (r1 < m) {
-        q1 -= 1;
-        r1 += d;
-        if (r1 >= d) {
-            if (r1 < m) {
-                q1 -= 1;
-                r1 += d;
-            }
-        }
-    }
-    r1 -= m;
-
-    r0 = r1 % d1;
-    q0 = r1 / d1;
-    m = q0 * d0;
-    r0 = (r0 << 32) | (uint32_t)n0;
-    if (r0 < m) {
-        q0 -= 1;
-        r0 += d;
-        if (r0 >= d) {
-            if (r0 < m) {
-                q0 -= 1;
-                r0 += d;
-            }
-        }
-    }
-    r0 -= m;
-
-    *r = r0;
-    return (q1 << 32) | q0;
-#endif
-}
-
 /*----------------------------------------------------------------------------
 | Returns an approximation to the square root of the 32-bit significand given
 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
+
 #ifndef HOST_UTILS_H
 #define HOST_UTILS_H
 
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
  */
 void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 
+/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
+                                  uint64_t n0, uint64_t d)
+{
+#if defined(__x86_64__)
+    uint64_t q;
+    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
+    return q;
+#elif defined(__s390x__) && !defined(__clang__)
+    /* Need to use a TImode type to get an even register pair for DLGR. */
+    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
+    *r = n >> 64;
+    return n;
+#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
+    /* From Power ISA 2.06, programming note for divdeu. */
+    uint64_t q1, q2, Q, r1, r2, R;
+    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
+        : "=&r"(q1), "=r"(q2)
+        : "r"(n1), "r"(n0), "r"(d));
+    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
+    r2 = n0 - (q2 * d);
+    Q = q1 + q2;
+    R = r1 + r2;
+    if (R >= d || R < r2) { /* overflow implies R > d */
+        Q += 1;
+        R -= d;
+    }
+    *r = R;
+    return Q;
+#else
+    uint64_t d0, d1, q0, q1, r1, r0, m;
+
+    d0 = (uint32_t)d;
+    d1 = d >> 32;
+
+    r1 = n1 % d1;
+    q1 = n1 / d1;
+    m = q1 * d0;
+    r1 = (r1 << 32) | (n0 >> 32);
+    if (r1 < m) {
+        q1 -= 1;
+        r1 += d;
+        if (r1 >= d) {
+            if (r1 < m) {
+                q1 -= 1;
+                r1 += d;
+            }
+        }
+    }
+    r1 -= m;
+
+    r0 = r1 % d1;
+    q0 = r1 / d1;
+    m = q0 * d0;
+    r0 = (r0 << 32) | (uint32_t)n0;
+    if (r0 < m) {
+        q0 -= 1;
+        r0 += d;
+        if (r0 >= d) {
+            if (r0 < m) {
+                q0 -= 1;
+                r0 += d;
+            }
+        }
+    }
+    r0 -= m;
+
+    *r = r0;
+    return (q1 << 32) | q0;
+#endif
+}
+
 #endif
-- 
2.25.1
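For readers unfamiliar with the GMP primitive: given a 128-bit numerator
n1:n0 and a divisor d with the precondition n1 < d, udiv_qrnnd() returns
the 64-bit quotient and stores the remainder through *r. A minimal usage
sketch (illustrative values only):

    uint64_t r;
    /* (2^64 + 5) / 3 = 0x5555555555555557, remainder 0; note 1 < 3 */
    uint64_t q = udiv_qrnnd(&r, 1, 5, 3);
    assert(q == 0x5555555555555557ULL && r == 0);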
From: Luis Pires <luis.pires@eldorado.org.br>

These will be used to implement new decimal floating point
instructions from Power ISA 3.1.

The remainder is now returned directly by divu128/divs128,
freeing up phigh to receive the high 64 bits of the quotient.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |   6 +-
 include/qemu/host-utils.h |  20 ++++--
 target/ppc/int_helper.c   |   9 +--
 util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 4 files changed, 108 insertions(+), 60 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
     if (clk->period == 0) {
         return 0;
     }
-    /*
-     * BUG: when CONFIG_INT128 is not defined, the current implementation of
-     * divu128 does not return a valid truncated quotient, so the result will
-     * be wrong.
-     */
+
     divu128(&lo, &hi, clk->period);
     return lo;
 }
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
+                               uint64_t divisor)
 {
     __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
     __uint128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 
-static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
+                              int64_t divisor)
 {
-    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
     __int128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
 
 uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
 {
-    int64_t rt = 0;
+    uint64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
     int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
     int cr;
     uint64_t lo_value;
     uint64_t hi_value;
+    uint64_t rem;
     ppc_avr_t ret = { .u64 = { 0, 0 } };
 
     if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
          * In that case, we leave r unchanged.
          */
     } else {
-        divu128(&lo_value, &hi_value, 1000000000000000ULL);
+        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 
-        for (i = 1; i < 16; hi_value /= 10, i++) {
-            bcd_put_digit(&ret, hi_value % 10, i);
+        for (i = 1; i < 16; rem /= 10, i++) {
+            bcd_put_digit(&ret, rem % 10, i);
         }
 
         for (; i < 32; lo_value /= 10, i++) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
 }
 
 /*
- * Unsigned 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Unsigned 128-by-64 division.
+ * Returns the remainder.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
-    unsigned i;
-    uint64_t carry = 0;
+    uint64_t rem, dhighest;
+    int sh;
 
     if (divisor == 0 || dhi == 0) {
         *plow = dlo / divisor;
-        *phigh = dlo % divisor;
+        *phigh = 0;
+        return dlo % divisor;
     } else {
+        sh = clz64(divisor);
 
-        for (i = 0; i < 64; i++) {
-            carry = dhi >> 63;
-            dhi = (dhi << 1) | (dlo >> 63);
-            if (carry || (dhi >= divisor)) {
-                dhi -= divisor;
-                carry = 1;
-            } else {
-                carry = 0;
+        if (dhi < divisor) {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
             }
-            dlo = (dlo << 1) | carry;
+
+            *phigh = 0;
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+        } else {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhighest = dhi >> (64 - sh);
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
+
+                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+            } else {
+                /**
+                 * dhi >= divisor
+                 * Since the MSB of divisor is set (sh == 0),
+                 * (dhi - divisor) < divisor
+                 *
+                 * Thus, the high part of the quotient is 1, and we can
+                 * calculate the low part with a single call to udiv_qrnnd
+                 * after subtracting divisor from dhi
+                 */
+                dhi -= divisor;
+                *phigh = 1;
+            }
+
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
         }
 
-        *plow = dlo;
-        *phigh = dhi;
+        /*
+         * since the dividend/divisor might have been normalized,
+         * the remainder might also have to be shifted back
+         */
+        return rem >> sh;
     }
 }
 
 /*
- * Signed 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Signed 128-by-64 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    int sgn_dvdnd = *phigh < 0;
-    int sgn_divsr = divisor < 0;
+    bool neg_quotient = false, neg_remainder = false;
+    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+    uint64_t rem;
 
-    if (sgn_dvdnd) {
-        *plow = ~(*plow);
-        *phigh = ~(*phigh);
-        if (*plow == (int64_t)-1) {
+    if (*phigh < 0) {
+        neg_quotient = !neg_quotient;
+        neg_remainder = !neg_remainder;
+
+        if (unsig_lo == 0) {
+            unsig_hi = -unsig_hi;
+        } else {
+            unsig_hi = ~unsig_hi;
+            unsig_lo = -unsig_lo;
+        }
+    }
+
+    if (divisor < 0) {
+        neg_quotient = !neg_quotient;
+
+        divisor = -divisor;
+    }
+
+    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+    if (neg_quotient) {
+        if (unsig_lo == 0) {
+            *phigh = -unsig_hi;
             *plow = 0;
-            (*phigh)++;
-        } else {
-            (*plow)++;
-        }
+        } else {
+            *phigh = ~unsig_hi;
+            *plow = -unsig_lo;
+        }
+    } else {
+        *phigh = unsig_hi;
+        *plow = unsig_lo;
     }
 
-    if (sgn_divsr) {
-        divisor = 0 - divisor;
-    }
-
-    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
-
-    if (sgn_dvdnd ^ sgn_divsr) {
-        *plow = 0 - *plow;
+    if (neg_remainder) {
+        return -rem;
+    } else {
+        return rem;
     }
 }
 #endif
-- 
2.25.1
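A short usage sketch of the new interface (illustrative values, assuming
this patch is applied): the full 128-bit quotient now comes back through
*plow/*phigh, and the remainder is the return value:

    uint64_t lo = 7, hi = 1;               /* dividend = 2^64 + 7 */
    uint64_t rem = divu128(&lo, &hi, 2);
    /* quotient = 2^63 + 3: hi == 0, lo == 0x8000000000000003; rem == 1 */
    assert(hi == 0 && lo == 0x8000000000000003ULL && rem == 1);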
New patch | |||
---|---|---|---|
1 | 1 | From: Luis Pires <luis.pires@eldorado.org.br> | |
2 | |||
3 | Signed-off-by: Luis Pires <luis.pires@eldorado.org.br> | ||
4 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++ | ||
9 | tests/unit/meson.build | 1 + | ||
10 | 2 files changed, 198 insertions(+) | ||
11 | create mode 100644 tests/unit/test-div128.c | ||
12 | |||
13 | diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c | ||
14 | new file mode 100644 | ||
15 | index XXXXXXX..XXXXXXX | ||
16 | --- /dev/null | ||
17 | +++ b/tests/unit/test-div128.c | ||
18 | @@ -XXX,XX +XXX,XX @@ | ||
19 | +/* | ||
20 | + * Test 128-bit division functions | ||
21 | + * | ||
22 | + * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br) | ||
23 | + * | ||
24 | + * This library is free software; you can redistribute it and/or | ||
25 | + * modify it under the terms of the GNU Lesser General Public | ||
26 | + * License as published by the Free Software Foundation; either | ||
27 | + * version 2.1 of the License, or (at your option) any later version. | ||
28 | + * | ||
29 | + * This library is distributed in the hope that it will be useful, | ||
30 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
31 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
32 | + * Lesser General Public License for more details. | ||
33 | + * | ||
34 | + * You should have received a copy of the GNU Lesser General Public | ||
35 | + * License along with this library; if not, see <http://www.gnu.org/licenses/>. | ||
36 | + */ | ||
37 | + | ||
38 | +#include "qemu/osdep.h" | ||
39 | +#include "qemu/host-utils.h" | ||
40 | + | ||
41 | +typedef struct { | ||
42 | + uint64_t high; | ||
43 | + uint64_t low; | ||
44 | + uint64_t rhigh; | ||
45 | + uint64_t rlow; | ||
46 | + uint64_t divisor; | ||
47 | + uint64_t remainder; | ||
48 | +} test_data_unsigned; | ||
49 | + | ||
50 | +typedef struct { | ||
51 | + int64_t high; | ||
52 | + uint64_t low; | ||
53 | + int64_t rhigh; | ||
54 | + uint64_t rlow; | ||
55 | + int64_t divisor; | ||
56 | + int64_t remainder; | ||
57 | +} test_data_signed; | ||
58 | + | ||
59 | +static const test_data_unsigned test_table_unsigned[] = { | ||
60 | + /* Dividend fits in 64 bits */ | ||
61 | + { 0x0000000000000000ULL, 0x0000000000000000ULL, | ||
62 | + 0x0000000000000000ULL, 0x0000000000000000ULL, | ||
63 | + 0x0000000000000001ULL, 0x0000000000000000ULL}, | ||
64 | + { 0x0000000000000000ULL, 0x0000000000000001ULL, | ||
65 | + 0x0000000000000000ULL, 0x0000000000000001ULL, | ||
66 | + 0x0000000000000001ULL, 0x0000000000000000ULL}, | ||
67 | + { 0x0000000000000000ULL, 0x0000000000000003ULL, | ||
68 | + 0x0000000000000000ULL, 0x0000000000000001ULL, | ||
69 | + 0x0000000000000002ULL, 0x0000000000000001ULL}, | ||
70 | + { 0x0000000000000000ULL, 0x8000000000000000ULL, | ||
71 | + 0x0000000000000000ULL, 0x8000000000000000ULL, | ||
72 | + 0x0000000000000001ULL, 0x0000000000000000ULL}, | ||
73 | + { 0x0000000000000000ULL, 0xa000000000000000ULL, | ||
74 | + 0x0000000000000000ULL, 0x0000000000000002ULL, | ||
75 | + 0x4000000000000000ULL, 0x2000000000000000ULL}, | ||
76 | + { 0x0000000000000000ULL, 0x8000000000000000ULL, | ||
77 | + 0x0000000000000000ULL, 0x0000000000000001ULL, | ||
78 | + 0x8000000000000000ULL, 0x0000000000000000ULL}, | ||
79 | + | ||
80 | + /* Dividend > 64 bits, with MSB 0 */ | ||
81 | + { 0x123456789abcdefeULL, 0xefedcba987654321ULL, | ||
82 | + 0x123456789abcdefeULL, 0xefedcba987654321ULL, | ||
83 | + 0x0000000000000001ULL, 0x0000000000000000ULL}, | ||
84 | + { 0x123456789abcdefeULL, 0xefedcba987654321ULL, | ||
85 | + 0x0000000000000001ULL, 0x000000000000000dULL, | ||
86 | + 0x123456789abcdefeULL, 0x03456789abcdf03bULL}, | ||
87 | + { 0x123456789abcdefeULL, 0xefedcba987654321ULL, | ||
88 | + 0x0123456789abcdefULL, 0xeefedcba98765432ULL, | ||
89 | + 0x0000000000000010ULL, 0x0000000000000001ULL}, | ||
90 | + | ||
91 | + /* Dividend > 64 bits, with MSB 1 */ | ||
92 | + { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL, | ||
93 | + 0xfeeddccbbaa99887ULL, 0x766554433221100fULL, | ||
94 | + 0x0000000000000001ULL, 0x0000000000000000ULL}, | ||
95 | + { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL, | ||
96 | + 0x0000000000000001ULL, 0x0000000000000000ULL, | ||
97 | + 0xfeeddccbbaa99887ULL, 0x766554433221100fULL}, | ||
98 | + { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL, | ||
99 | + 0x0feeddccbbaa9988ULL, 0x7766554433221100ULL, | ||
100 | + 0x0000000000000010ULL, 0x000000000000000fULL}, | ||
101 | + { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL, | ||
102 | + 0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL, | ||
103 | + 0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL}, | ||
104 | + | ||
105 | + /** | ||
106 | + * Divisor == 64 bits, with MSB 1 | ||
107 | + * and high 64 bits of dividend >= divisor | ||
108 | + * (for testing normalization) | ||
109 | + */ | ||
110 | + { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL, | ||
111 | + 0x0000000000000001ULL, 0x0000000000000000ULL, | ||
112 | + 0xfeeddccbbaa99887ULL, 0x766554433221100fULL}, | ||
113 | + { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL, | ||
114 | + 0x0000000000000001ULL, 0xfddbb9977553310aULL, | ||
115 | + 0x8000000000000001ULL, 0x78899aabbccddf05ULL}, | ||
116 | + | ||
117 | + /* Dividend > 64 bits, divisor almost as big */ | ||
118 | + { 0x0000000000000001ULL, 0x23456789abcdef01ULL, | ||
119 | + 0x0000000000000000ULL, 0x000000000000000fULL, | ||
120 | + 0x123456789abcdefeULL, 0x123456789abcde1fULL}, | ||
121 | +}; | ||
122 | + | ||
123 | +static const test_data_signed test_table_signed[] = { | ||
124 | + /* Positive dividend, positive/negative divisors */ | ||
125 | + { 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
126 | + 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
127 | + 0x0000000000000001LL, 0x0000000000000000LL}, | ||
128 | + { 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
129 | + 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
130 | + 0xffffffffffffffffLL, 0x0000000000000000LL}, | ||
131 | + { 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
132 | + 0x0000000000000000LL, 0x00000000005e30a7ULL, | ||
133 | + 0x0000000000000002LL, 0x0000000000000000LL}, | ||
134 | + { 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
135 | + 0xffffffffffffffffLL, 0xffffffffffa1cf59ULL, | ||
136 | + 0xfffffffffffffffeLL, 0x0000000000000000LL}, | ||
137 | + { 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
138 | + 0x0000000000000000LL, 0x0000000000178c29ULL, | ||
139 | + 0x0000000000000008LL, 0x0000000000000006LL}, | ||
140 | + { 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
141 | + 0xffffffffffffffffLL, 0xffffffffffe873d7ULL, | ||
142 | + 0xfffffffffffffff8LL, 0x0000000000000006LL}, | ||
143 | + { 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
144 | + 0x0000000000000000LL, 0x000000000000550dULL, | ||
145 | + 0x0000000000000237LL, 0x0000000000000183LL}, | ||
146 | + { 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
147 | + 0xffffffffffffffffLL, 0xffffffffffffaaf3ULL, | ||
148 | + 0xfffffffffffffdc9LL, 0x0000000000000183LL}, | ||
149 | + | ||
150 | + /* Negative dividend, positive/negative divisors */ | ||
151 | + { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
152 | + 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
153 | + 0x0000000000000001LL, 0x0000000000000000LL}, | ||
154 | + { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
155 | + 0x0000000000000000LL, 0x0000000000bc614eULL, | ||
156 | + 0xffffffffffffffffLL, 0x0000000000000000LL}, | ||
157 | + { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
158 | + 0xffffffffffffffffLL, 0xffffffffffa1cf59ULL, | ||
159 | + 0x0000000000000002LL, 0x0000000000000000LL}, | ||
160 | + { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
161 | + 0x0000000000000000LL, 0x00000000005e30a7ULL, | ||
162 | + 0xfffffffffffffffeLL, 0x0000000000000000LL}, | ||
163 | + { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
164 | + 0xffffffffffffffffLL, 0xffffffffffe873d7ULL, | ||
165 | + 0x0000000000000008LL, 0xfffffffffffffffaLL}, | ||
166 | + { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
167 | + 0x0000000000000000LL, 0x0000000000178c29ULL, | ||
168 | + 0xfffffffffffffff8LL, 0xfffffffffffffffaLL}, | ||
169 | + { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
170 | + 0xffffffffffffffffLL, 0xffffffffffffaaf3ULL, | ||
171 | + 0x0000000000000237LL, 0xfffffffffffffe7dLL}, | ||
172 | + { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL, | ||
173 | + 0x0000000000000000LL, 0x000000000000550dULL, | ||
174 | + 0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL}, | ||
175 | +}; | ||
176 | + | ||
177 | +static void test_divu128(void) | ||
178 | +{ | ||
179 | + int i; | ||
180 | + uint64_t rem; | ||
181 | + test_data_unsigned tmp; | ||
182 | + | ||
183 | + for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) { | ||
184 | + tmp = test_table_unsigned[i]; | ||
185 | + | ||
186 | + rem = divu128(&tmp.low, &tmp.high, tmp.divisor); | ||
187 | + g_assert_cmpuint(tmp.low, ==, tmp.rlow); | ||
188 | + g_assert_cmpuint(tmp.high, ==, tmp.rhigh); | ||
189 | + g_assert_cmpuint(rem, ==, tmp.remainder); | ||
190 | + } | ||
191 | +} | ||
192 | + | ||
193 | +static void test_divs128(void) | ||
194 | +{ | ||
195 | + int i; | ||
196 | + int64_t rem; | ||
197 | + test_data_signed tmp; | ||
198 | + | ||
199 | + for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) { | ||
200 | + tmp = test_table_signed[i]; | ||
201 | + | ||
202 | + rem = divs128(&tmp.low, &tmp.high, tmp.divisor); | ||
203 | + g_assert_cmpuint(tmp.low, ==, tmp.rlow); | ||
204 | + g_assert_cmpuint(tmp.high, ==, tmp.rhigh); | ||
205 | + g_assert_cmpuint(rem, ==, tmp.remainder); | ||
206 | + } | ||
207 | +} | ||
208 | + | ||
209 | +int main(int argc, char **argv) | ||
210 | +{ | ||
211 | + g_test_init(&argc, &argv, NULL); | ||
212 | + g_test_add_func("/host-utils/test_divu128", test_divu128); | ||
213 | + g_test_add_func("/host-utils/test_divs128", test_divs128); | ||
214 | + return g_test_run(); | ||
215 | +} | ||
216 | diff --git a/tests/unit/meson.build b/tests/unit/meson.build | ||
217 | index XXXXXXX..XXXXXXX 100644 | ||
218 | --- a/tests/unit/meson.build | ||
219 | +++ b/tests/unit/meson.build | ||
220 | @@ -XXX,XX +XXX,XX @@ tests = { | ||
221 | # all code tested by test-x86-cpuid is inside topology.h | ||
222 | 'test-x86-cpuid': [], | ||
223 | 'test-cutils': [], | ||
224 | + 'test-div128': [], | ||
225 | 'test-shift128': [], | ||
226 | 'test-mul64': [], | ||
227 | # all code tested by test-int128 is inside int128.h | ||
228 | -- | ||
229 | 2.25.1 | ||
230 | |||
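A minimal usage sketch of the interface the tests above exercise (assuming only what the tests show: divu128() takes pointers to the low and high halves of a 128-bit dividend, overwrites them in place with the quotient, and returns the remainder; the declaration is assumed to live in QEMU's host-utils header):

    #include "qemu/osdep.h"
    #include "qemu/host-utils.h"

    static void divu128_example(void)
    {
        uint64_t lo = 7, hi = 0;             /* 128-bit dividend hi:lo = 7 */
        uint64_t rem = divu128(&lo, &hi, 3); /* divide by 3 */
        /* quotient now in hi:lo, remainder returned */
        g_assert(hi == 0 && lo == 2 && rem == 1);
    }

divs128() follows the same in-place convention with a signed divisor and remainder, as the signed test table above encodes.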
New patch | |||
1 | Prepare for tracking different masks by renaming this one to z_mask. | ||
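An illustrative aside (not part of the patch): z_mask tracks known-zero bits, so a 0 bit in the mask means the corresponding value bit is certainly 0. The propagation rules visible in the diff below then follow from plain bit arithmetic, e.g. for and/or:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t za = 0x00ff;   /* a is known to fit in the low 8 bits */
        uint64_t zb = 0x0f0f;
        /* and can only clear bits: the result's mask is za & zb (0xf) */
        printf("and: %#" PRIx64 "\n", za & zb);
        /* or may set any bit set in either input: za | zb (0xfff) */
        printf("or:  %#" PRIx64 "\n", za | zb);
        return 0;
    }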
1 | 2 | ||
3 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------ | ||
9 | 1 file changed, 72 insertions(+), 70 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo { | ||
16 | TCGTemp *prev_copy; | ||
17 | TCGTemp *next_copy; | ||
18 | uint64_t val; | ||
19 | - uint64_t mask; | ||
20 | + uint64_t z_mask; /* mask bit is 0 if and only if value bit is 0 */ | ||
21 | } TempOptInfo; | ||
22 | |||
23 | static inline TempOptInfo *ts_info(TCGTemp *ts) | ||
24 | @@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts) | ||
25 | ti->next_copy = ts; | ||
26 | ti->prev_copy = ts; | ||
27 | ti->is_const = false; | ||
28 | - ti->mask = -1; | ||
29 | + ti->z_mask = -1; | ||
30 | } | ||
31 | |||
32 | static void reset_temp(TCGArg arg) | ||
33 | @@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts) | ||
34 | if (ts->kind == TEMP_CONST) { | ||
35 | ti->is_const = true; | ||
36 | ti->val = ts->val; | ||
37 | - ti->mask = ts->val; | ||
38 | + ti->z_mask = ts->val; | ||
39 | if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) { | ||
40 | /* High bits of a 32-bit quantity are garbage. */ | ||
41 | - ti->mask |= ~0xffffffffull; | ||
42 | + ti->z_mask |= ~0xffffffffull; | ||
43 | } | ||
44 | } else { | ||
45 | ti->is_const = false; | ||
46 | - ti->mask = -1; | ||
47 | + ti->z_mask = -1; | ||
48 | } | ||
49 | } | ||
50 | |||
51 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) | ||
52 | const TCGOpDef *def; | ||
53 | TempOptInfo *di; | ||
54 | TempOptInfo *si; | ||
55 | - uint64_t mask; | ||
56 | + uint64_t z_mask; | ||
57 | TCGOpcode new_op; | ||
58 | |||
59 | if (ts_are_copies(dst_ts, src_ts)) { | ||
60 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) | ||
61 | op->args[0] = dst; | ||
62 | op->args[1] = src; | ||
63 | |||
64 | - mask = si->mask; | ||
65 | + z_mask = si->z_mask; | ||
66 | if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { | ||
67 | /* High bits of the destination are now garbage. */ | ||
68 | - mask |= ~0xffffffffull; | ||
69 | + z_mask |= ~0xffffffffull; | ||
70 | } | ||
71 | - di->mask = mask; | ||
72 | + di->z_mask = z_mask; | ||
73 | |||
74 | if (src_ts->type == dst_ts->type) { | ||
75 | TempOptInfo *ni = ts_info(si->next_copy); | ||
76 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
77 | } | ||
78 | |||
79 | QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) { | ||
80 | - uint64_t mask, partmask, affected, tmp; | ||
81 | + uint64_t z_mask, partmask, affected, tmp; | ||
82 | int nb_oargs, nb_iargs; | ||
83 | TCGOpcode opc = op->opc; | ||
84 | const TCGOpDef *def = &tcg_op_defs[opc]; | ||
85 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
86 | |||
87 | /* Simplify using known-zero bits. Currently only ops with a single | ||
88 | output argument are supported. */ | ||
89 | - mask = -1; | ||
90 | + z_mask = -1; | ||
91 | affected = -1; | ||
92 | switch (opc) { | ||
93 | CASE_OP_32_64(ext8s): | ||
94 | - if ((arg_info(op->args[1])->mask & 0x80) != 0) { | ||
95 | + if ((arg_info(op->args[1])->z_mask & 0x80) != 0) { | ||
96 | break; | ||
97 | } | ||
98 | QEMU_FALLTHROUGH; | ||
99 | CASE_OP_32_64(ext8u): | ||
100 | - mask = 0xff; | ||
101 | + z_mask = 0xff; | ||
102 | goto and_const; | ||
103 | CASE_OP_32_64(ext16s): | ||
104 | - if ((arg_info(op->args[1])->mask & 0x8000) != 0) { | ||
105 | + if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) { | ||
106 | break; | ||
107 | } | ||
108 | QEMU_FALLTHROUGH; | ||
109 | CASE_OP_32_64(ext16u): | ||
110 | - mask = 0xffff; | ||
111 | + z_mask = 0xffff; | ||
112 | goto and_const; | ||
113 | case INDEX_op_ext32s_i64: | ||
114 | - if ((arg_info(op->args[1])->mask & 0x80000000) != 0) { | ||
115 | + if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) { | ||
116 | break; | ||
117 | } | ||
118 | QEMU_FALLTHROUGH; | ||
119 | case INDEX_op_ext32u_i64: | ||
120 | - mask = 0xffffffffU; | ||
121 | + z_mask = 0xffffffffU; | ||
122 | goto and_const; | ||
123 | |||
124 | CASE_OP_32_64(and): | ||
125 | - mask = arg_info(op->args[2])->mask; | ||
126 | + z_mask = arg_info(op->args[2])->z_mask; | ||
127 | if (arg_is_const(op->args[2])) { | ||
128 | and_const: | ||
129 | - affected = arg_info(op->args[1])->mask & ~mask; | ||
130 | + affected = arg_info(op->args[1])->z_mask & ~z_mask; | ||
131 | } | ||
132 | - mask = arg_info(op->args[1])->mask & mask; | ||
133 | + z_mask = arg_info(op->args[1])->z_mask & z_mask; | ||
134 | break; | ||
135 | |||
136 | case INDEX_op_ext_i32_i64: | ||
137 | - if ((arg_info(op->args[1])->mask & 0x80000000) != 0) { | ||
138 | + if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) { | ||
139 | break; | ||
140 | } | ||
141 | QEMU_FALLTHROUGH; | ||
142 | case INDEX_op_extu_i32_i64: | ||
143 | /* We do not compute affected as it is a size changing op. */ | ||
144 | - mask = (uint32_t)arg_info(op->args[1])->mask; | ||
145 | + z_mask = (uint32_t)arg_info(op->args[1])->z_mask; | ||
146 | break; | ||
147 | |||
148 | CASE_OP_32_64(andc): | ||
149 | /* Known-zeros does not imply known-ones. Therefore unless | ||
150 | op->args[2] is constant, we can't infer anything from it. */ | ||
151 | if (arg_is_const(op->args[2])) { | ||
152 | - mask = ~arg_info(op->args[2])->mask; | ||
153 | + z_mask = ~arg_info(op->args[2])->z_mask; | ||
154 | goto and_const; | ||
155 | } | ||
156 | /* But we certainly know nothing outside args[1] may be set. */ | ||
157 | - mask = arg_info(op->args[1])->mask; | ||
158 | + z_mask = arg_info(op->args[1])->z_mask; | ||
159 | break; | ||
160 | |||
161 | case INDEX_op_sar_i32: | ||
162 | if (arg_is_const(op->args[2])) { | ||
163 | tmp = arg_info(op->args[2])->val & 31; | ||
164 | - mask = (int32_t)arg_info(op->args[1])->mask >> tmp; | ||
165 | + z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp; | ||
166 | } | ||
167 | break; | ||
168 | case INDEX_op_sar_i64: | ||
169 | if (arg_is_const(op->args[2])) { | ||
170 | tmp = arg_info(op->args[2])->val & 63; | ||
171 | - mask = (int64_t)arg_info(op->args[1])->mask >> tmp; | ||
172 | + z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp; | ||
173 | } | ||
174 | break; | ||
175 | |||
176 | case INDEX_op_shr_i32: | ||
177 | if (arg_is_const(op->args[2])) { | ||
178 | tmp = arg_info(op->args[2])->val & 31; | ||
179 | - mask = (uint32_t)arg_info(op->args[1])->mask >> tmp; | ||
180 | + z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp; | ||
181 | } | ||
182 | break; | ||
183 | case INDEX_op_shr_i64: | ||
184 | if (arg_is_const(op->args[2])) { | ||
185 | tmp = arg_info(op->args[2])->val & 63; | ||
186 | - mask = (uint64_t)arg_info(op->args[1])->mask >> tmp; | ||
187 | + z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp; | ||
188 | } | ||
189 | break; | ||
190 | |||
191 | case INDEX_op_extrl_i64_i32: | ||
192 | - mask = (uint32_t)arg_info(op->args[1])->mask; | ||
193 | + z_mask = (uint32_t)arg_info(op->args[1])->z_mask; | ||
194 | break; | ||
195 | case INDEX_op_extrh_i64_i32: | ||
196 | - mask = (uint64_t)arg_info(op->args[1])->mask >> 32; | ||
197 | + z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32; | ||
198 | break; | ||
199 | |||
200 | CASE_OP_32_64(shl): | ||
201 | if (arg_is_const(op->args[2])) { | ||
202 | tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1); | ||
203 | - mask = arg_info(op->args[1])->mask << tmp; | ||
204 | + z_mask = arg_info(op->args[1])->z_mask << tmp; | ||
205 | } | ||
206 | break; | ||
207 | |||
208 | CASE_OP_32_64(neg): | ||
209 | /* Set to 1 all bits to the left of the rightmost. */ | ||
210 | - mask = -(arg_info(op->args[1])->mask | ||
211 | - & -arg_info(op->args[1])->mask); | ||
212 | + z_mask = -(arg_info(op->args[1])->z_mask | ||
213 | + & -arg_info(op->args[1])->z_mask); | ||
214 | break; | ||
215 | |||
216 | CASE_OP_32_64(deposit): | ||
217 | - mask = deposit64(arg_info(op->args[1])->mask, | ||
218 | - op->args[3], op->args[4], | ||
219 | - arg_info(op->args[2])->mask); | ||
220 | + z_mask = deposit64(arg_info(op->args[1])->z_mask, | ||
221 | + op->args[3], op->args[4], | ||
222 | + arg_info(op->args[2])->z_mask); | ||
223 | break; | ||
224 | |||
225 | CASE_OP_32_64(extract): | ||
226 | - mask = extract64(arg_info(op->args[1])->mask, | ||
227 | - op->args[2], op->args[3]); | ||
228 | + z_mask = extract64(arg_info(op->args[1])->z_mask, | ||
229 | + op->args[2], op->args[3]); | ||
230 | if (op->args[2] == 0) { | ||
231 | - affected = arg_info(op->args[1])->mask & ~mask; | ||
232 | + affected = arg_info(op->args[1])->z_mask & ~z_mask; | ||
233 | } | ||
234 | break; | ||
235 | CASE_OP_32_64(sextract): | ||
236 | - mask = sextract64(arg_info(op->args[1])->mask, | ||
237 | - op->args[2], op->args[3]); | ||
238 | - if (op->args[2] == 0 && (tcg_target_long)mask >= 0) { | ||
239 | - affected = arg_info(op->args[1])->mask & ~mask; | ||
240 | + z_mask = sextract64(arg_info(op->args[1])->z_mask, | ||
241 | + op->args[2], op->args[3]); | ||
242 | + if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) { | ||
243 | + affected = arg_info(op->args[1])->z_mask & ~z_mask; | ||
244 | } | ||
245 | break; | ||
246 | |||
247 | CASE_OP_32_64(or): | ||
248 | CASE_OP_32_64(xor): | ||
249 | - mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask; | ||
250 | + z_mask = arg_info(op->args[1])->z_mask | ||
251 | + | arg_info(op->args[2])->z_mask; | ||
252 | break; | ||
253 | |||
254 | case INDEX_op_clz_i32: | ||
255 | case INDEX_op_ctz_i32: | ||
256 | - mask = arg_info(op->args[2])->mask | 31; | ||
257 | + z_mask = arg_info(op->args[2])->z_mask | 31; | ||
258 | break; | ||
259 | |||
260 | case INDEX_op_clz_i64: | ||
261 | case INDEX_op_ctz_i64: | ||
262 | - mask = arg_info(op->args[2])->mask | 63; | ||
263 | + z_mask = arg_info(op->args[2])->z_mask | 63; | ||
264 | break; | ||
265 | |||
266 | case INDEX_op_ctpop_i32: | ||
267 | - mask = 32 | 31; | ||
268 | + z_mask = 32 | 31; | ||
269 | break; | ||
270 | case INDEX_op_ctpop_i64: | ||
271 | - mask = 64 | 63; | ||
272 | + z_mask = 64 | 63; | ||
273 | break; | ||
274 | |||
275 | CASE_OP_32_64(setcond): | ||
276 | case INDEX_op_setcond2_i32: | ||
277 | - mask = 1; | ||
278 | + z_mask = 1; | ||
279 | break; | ||
280 | |||
281 | CASE_OP_32_64(movcond): | ||
282 | - mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask; | ||
283 | + z_mask = arg_info(op->args[3])->z_mask | ||
284 | + | arg_info(op->args[4])->z_mask; | ||
285 | break; | ||
286 | |||
287 | CASE_OP_32_64(ld8u): | ||
288 | - mask = 0xff; | ||
289 | + z_mask = 0xff; | ||
290 | break; | ||
291 | CASE_OP_32_64(ld16u): | ||
292 | - mask = 0xffff; | ||
293 | + z_mask = 0xffff; | ||
294 | break; | ||
295 | case INDEX_op_ld32u_i64: | ||
296 | - mask = 0xffffffffu; | ||
297 | + z_mask = 0xffffffffu; | ||
298 | break; | ||
299 | |||
300 | CASE_OP_32_64(qemu_ld): | ||
301 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
302 | MemOpIdx oi = op->args[nb_oargs + nb_iargs]; | ||
303 | MemOp mop = get_memop(oi); | ||
304 | if (!(mop & MO_SIGN)) { | ||
305 | - mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1; | ||
306 | + z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1; | ||
307 | } | ||
308 | } | ||
309 | break; | ||
310 | |||
311 | CASE_OP_32_64(bswap16): | ||
312 | - mask = arg_info(op->args[1])->mask; | ||
313 | - if (mask <= 0xffff) { | ||
314 | + z_mask = arg_info(op->args[1])->z_mask; | ||
315 | + if (z_mask <= 0xffff) { | ||
316 | op->args[2] |= TCG_BSWAP_IZ; | ||
317 | } | ||
318 | - mask = bswap16(mask); | ||
319 | + z_mask = bswap16(z_mask); | ||
320 | switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) { | ||
321 | case TCG_BSWAP_OZ: | ||
322 | break; | ||
323 | case TCG_BSWAP_OS: | ||
324 | - mask = (int16_t)mask; | ||
325 | + z_mask = (int16_t)z_mask; | ||
326 | break; | ||
327 | default: /* undefined high bits */ | ||
328 | - mask |= MAKE_64BIT_MASK(16, 48); | ||
329 | + z_mask |= MAKE_64BIT_MASK(16, 48); | ||
330 | break; | ||
331 | } | ||
332 | break; | ||
333 | |||
334 | case INDEX_op_bswap32_i64: | ||
335 | - mask = arg_info(op->args[1])->mask; | ||
336 | - if (mask <= 0xffffffffu) { | ||
337 | + z_mask = arg_info(op->args[1])->z_mask; | ||
338 | + if (z_mask <= 0xffffffffu) { | ||
339 | op->args[2] |= TCG_BSWAP_IZ; | ||
340 | } | ||
341 | - mask = bswap32(mask); | ||
342 | + z_mask = bswap32(z_mask); | ||
343 | switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) { | ||
344 | case TCG_BSWAP_OZ: | ||
345 | break; | ||
346 | case TCG_BSWAP_OS: | ||
347 | - mask = (int32_t)mask; | ||
348 | + z_mask = (int32_t)z_mask; | ||
349 | break; | ||
350 | default: /* undefined high bits */ | ||
351 | - mask |= MAKE_64BIT_MASK(32, 32); | ||
352 | + z_mask |= MAKE_64BIT_MASK(32, 32); | ||
353 | break; | ||
354 | } | ||
355 | break; | ||
356 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
357 | /* 32-bit ops generate 32-bit results. For the "result is zero" test | ||
358 | below, we can ignore high bits, but for further optimizations we | ||
359 | need to record that the high bits contain garbage. */ | ||
360 | - partmask = mask; | ||
361 | + partmask = z_mask; | ||
362 | if (!(def->flags & TCG_OPF_64BIT)) { | ||
363 | - mask |= ~(tcg_target_ulong)0xffffffffu; | ||
364 | + z_mask |= ~(tcg_target_ulong)0xffffffffu; | ||
365 | partmask &= 0xffffffffu; | ||
366 | affected &= 0xffffffffu; | ||
367 | } | ||
368 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
369 | vs the high word of the input. */ | ||
370 | do_setcond_high: | ||
371 | reset_temp(op->args[0]); | ||
372 | - arg_info(op->args[0])->mask = 1; | ||
373 | + arg_info(op->args[0])->z_mask = 1; | ||
374 | op->opc = INDEX_op_setcond_i32; | ||
375 | op->args[1] = op->args[2]; | ||
376 | op->args[2] = op->args[4]; | ||
377 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
378 | } | ||
379 | do_setcond_low: | ||
380 | reset_temp(op->args[0]); | ||
381 | - arg_info(op->args[0])->mask = 1; | ||
382 | + arg_info(op->args[0])->z_mask = 1; | ||
383 | op->opc = INDEX_op_setcond_i32; | ||
384 | op->args[2] = op->args[3]; | ||
385 | op->args[3] = op->args[5]; | ||
386 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
387 | /* Default case: we know nothing about operation (or were unable | ||
388 | to compute the operation result) so no propagation is done. | ||
389 | We trash everything if the operation is the end of a basic | ||
390 | - block, otherwise we only trash the output args. "mask" is | ||
391 | + block, otherwise we only trash the output args. "z_mask" is | ||
392 | the non-zero bits mask for the first output arg. */ | ||
393 | if (def->flags & TCG_OPF_BB_END) { | ||
394 | memset(&temps_used, 0, sizeof(temps_used)); | ||
395 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
396 | /* Save the corresponding known-zero bits mask for the | ||
397 | first output argument (only one supported so far). */ | ||
398 | if (i == 0) { | ||
399 | - arg_info(op->args[i])->mask = mask; | ||
400 | + arg_info(op->args[i])->z_mask = z_mask; | ||
401 | } | ||
402 | } | ||
403 | } | ||
404 | -- | ||
405 | 2.25.1 | ||
406 | |||
New patch | |||
1 | Provide what will become a larger context for splitting | ||
2 | the very large tcg_optimize function. | ||
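A sketch of the shape being introduced, with stand-in types instead of TCG's (illustrative only): once loose state lives in a context struct, later patches can grow it without touching every helper signature.

    #include <stdbool.h>

    typedef struct OptContext {
        bool temps_used[8];     /* stand-in for TCGTempSet */
        /* later patches add more state here */
    } OptContext;

    /* was: init_ts_info(TCGTempSet *temps_used, int idx) */
    static void init_ts_info(OptContext *ctx, int idx)
    {
        ctx->temps_used[idx] = true;
    }

    int main(void)
    {
        OptContext ctx = {0};
        init_ts_info(&ctx, 3);
        return ctx.temps_used[3] ? 0 : 1;
    }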
1 | 3 | ||
4 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------ | ||
10 | 1 file changed, 40 insertions(+), 37 deletions(-) | ||
11 | |||
12 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/optimize.c | ||
15 | +++ b/tcg/optimize.c | ||
16 | @@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo { | ||
17 | uint64_t z_mask; /* mask bit is 0 if and only if value bit is 0 */ | ||
18 | } TempOptInfo; | ||
19 | |||
20 | +typedef struct OptContext { | ||
21 | + TCGTempSet temps_used; | ||
22 | +} OptContext; | ||
23 | + | ||
24 | static inline TempOptInfo *ts_info(TCGTemp *ts) | ||
25 | { | ||
26 | return ts->state_ptr; | ||
27 | @@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg) | ||
28 | } | ||
29 | |||
30 | /* Initialize and activate a temporary. */ | ||
31 | -static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts) | ||
32 | +static void init_ts_info(OptContext *ctx, TCGTemp *ts) | ||
33 | { | ||
34 | size_t idx = temp_idx(ts); | ||
35 | TempOptInfo *ti; | ||
36 | |||
37 | - if (test_bit(idx, temps_used->l)) { | ||
38 | + if (test_bit(idx, ctx->temps_used.l)) { | ||
39 | return; | ||
40 | } | ||
41 | - set_bit(idx, temps_used->l); | ||
42 | + set_bit(idx, ctx->temps_used.l); | ||
43 | |||
44 | ti = ts->state_ptr; | ||
45 | if (ti == NULL) { | ||
46 | @@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts) | ||
47 | } | ||
48 | } | ||
49 | |||
50 | -static void init_arg_info(TCGTempSet *temps_used, TCGArg arg) | ||
51 | +static void init_arg_info(OptContext *ctx, TCGArg arg) | ||
52 | { | ||
53 | - init_ts_info(temps_used, arg_temp(arg)); | ||
54 | + init_ts_info(ctx, arg_temp(arg)); | ||
55 | } | ||
56 | |||
57 | static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts) | ||
58 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) | ||
59 | } | ||
60 | } | ||
61 | |||
62 | -static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used, | ||
63 | +static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx, | ||
64 | TCGOp *op, TCGArg dst, uint64_t val) | ||
65 | { | ||
66 | const TCGOpDef *def = &tcg_op_defs[op->opc]; | ||
67 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used, | ||
68 | |||
69 | /* Convert movi to mov with constant temp. */ | ||
70 | tv = tcg_constant_internal(type, val); | ||
71 | - init_ts_info(temps_used, tv); | ||
72 | + init_ts_info(ctx, tv); | ||
73 | tcg_opt_gen_mov(s, op, dst, temp_arg(tv)); | ||
74 | } | ||
75 | |||
76 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
77 | { | ||
78 | int nb_temps, nb_globals, i; | ||
79 | TCGOp *op, *op_next, *prev_mb = NULL; | ||
80 | - TCGTempSet temps_used; | ||
81 | + OptContext ctx = {}; | ||
82 | |||
83 | /* Array VALS has an element for each temp. | ||
84 | If this temp holds a constant then its value is kept in VALS' element. | ||
85 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
86 | nb_temps = s->nb_temps; | ||
87 | nb_globals = s->nb_globals; | ||
88 | |||
89 | - memset(&temps_used, 0, sizeof(temps_used)); | ||
90 | for (i = 0; i < nb_temps; ++i) { | ||
91 | s->temps[i].state_ptr = NULL; | ||
92 | } | ||
93 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
94 | for (i = 0; i < nb_oargs + nb_iargs; i++) { | ||
95 | TCGTemp *ts = arg_temp(op->args[i]); | ||
96 | if (ts) { | ||
97 | - init_ts_info(&temps_used, ts); | ||
98 | + init_ts_info(&ctx, ts); | ||
99 | } | ||
100 | } | ||
101 | } else { | ||
102 | nb_oargs = def->nb_oargs; | ||
103 | nb_iargs = def->nb_iargs; | ||
104 | for (i = 0; i < nb_oargs + nb_iargs; i++) { | ||
105 | - init_arg_info(&temps_used, op->args[i]); | ||
106 | + init_arg_info(&ctx, op->args[i]); | ||
107 | } | ||
108 | } | ||
109 | |||
110 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
111 | CASE_OP_32_64(rotr): | ||
112 | if (arg_is_const(op->args[1]) | ||
113 | && arg_info(op->args[1])->val == 0) { | ||
114 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0); | ||
115 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0); | ||
116 | continue; | ||
117 | } | ||
118 | break; | ||
119 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
120 | |||
121 | if (partmask == 0) { | ||
122 | tcg_debug_assert(nb_oargs == 1); | ||
123 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0); | ||
124 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0); | ||
125 | continue; | ||
126 | } | ||
127 | if (affected == 0) { | ||
128 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
129 | CASE_OP_32_64(mulsh): | ||
130 | if (arg_is_const(op->args[2]) | ||
131 | && arg_info(op->args[2])->val == 0) { | ||
132 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0); | ||
133 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0); | ||
134 | continue; | ||
135 | } | ||
136 | break; | ||
137 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
138 | CASE_OP_32_64_VEC(sub): | ||
139 | CASE_OP_32_64_VEC(xor): | ||
140 | if (args_are_copies(op->args[1], op->args[2])) { | ||
141 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0); | ||
142 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0); | ||
143 | continue; | ||
144 | } | ||
145 | break; | ||
146 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
147 | if (arg_is_const(op->args[1])) { | ||
148 | tmp = arg_info(op->args[1])->val; | ||
149 | tmp = dup_const(TCGOP_VECE(op), tmp); | ||
150 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
151 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
152 | break; | ||
153 | } | ||
154 | goto do_default; | ||
155 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
156 | case INDEX_op_dup2_vec: | ||
157 | assert(TCG_TARGET_REG_BITS == 32); | ||
158 | if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
159 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], | ||
160 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], | ||
161 | deposit64(arg_info(op->args[1])->val, 32, 32, | ||
162 | arg_info(op->args[2])->val)); | ||
163 | break; | ||
164 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
165 | case INDEX_op_extrh_i64_i32: | ||
166 | if (arg_is_const(op->args[1])) { | ||
167 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0); | ||
168 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
169 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
170 | break; | ||
171 | } | ||
172 | goto do_default; | ||
173 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
174 | if (arg_is_const(op->args[1])) { | ||
175 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, | ||
176 | op->args[2]); | ||
177 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
178 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
179 | break; | ||
180 | } | ||
181 | goto do_default; | ||
182 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
183 | if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
184 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, | ||
185 | arg_info(op->args[2])->val); | ||
186 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
187 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
188 | break; | ||
189 | } | ||
190 | goto do_default; | ||
191 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
192 | TCGArg v = arg_info(op->args[1])->val; | ||
193 | if (v != 0) { | ||
194 | tmp = do_constant_folding(opc, v, 0); | ||
195 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
196 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
197 | } else { | ||
198 | tcg_opt_gen_mov(s, op, op->args[0], op->args[2]); | ||
199 | } | ||
200 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
201 | tmp = deposit64(arg_info(op->args[1])->val, | ||
202 | op->args[3], op->args[4], | ||
203 | arg_info(op->args[2])->val); | ||
204 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
205 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
206 | break; | ||
207 | } | ||
208 | goto do_default; | ||
209 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
210 | if (arg_is_const(op->args[1])) { | ||
211 | tmp = extract64(arg_info(op->args[1])->val, | ||
212 | op->args[2], op->args[3]); | ||
213 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
214 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
215 | break; | ||
216 | } | ||
217 | goto do_default; | ||
218 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
219 | if (arg_is_const(op->args[1])) { | ||
220 | tmp = sextract64(arg_info(op->args[1])->val, | ||
221 | op->args[2], op->args[3]); | ||
222 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
223 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
224 | break; | ||
225 | } | ||
226 | goto do_default; | ||
227 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
228 | tmp = (int32_t)(((uint32_t)v1 >> shr) | | ||
229 | ((uint32_t)v2 << (32 - shr))); | ||
230 | } | ||
231 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
232 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
233 | break; | ||
234 | } | ||
235 | goto do_default; | ||
236 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
237 | tmp = do_constant_folding_cond(opc, op->args[1], | ||
238 | op->args[2], op->args[3]); | ||
239 | if (tmp != 2) { | ||
240 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
241 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
242 | break; | ||
243 | } | ||
244 | goto do_default; | ||
245 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
246 | op->args[1], op->args[2]); | ||
247 | if (tmp != 2) { | ||
248 | if (tmp) { | ||
249 | - memset(&temps_used, 0, sizeof(temps_used)); | ||
250 | + memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
251 | op->opc = INDEX_op_br; | ||
252 | op->args[0] = op->args[3]; | ||
253 | } else { | ||
254 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
255 | |||
256 | rl = op->args[0]; | ||
257 | rh = op->args[1]; | ||
258 | - tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a); | ||
259 | - tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32)); | ||
260 | + tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a); | ||
261 | + tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32)); | ||
262 | break; | ||
263 | } | ||
264 | goto do_default; | ||
265 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
266 | |||
267 | rl = op->args[0]; | ||
268 | rh = op->args[1]; | ||
269 | - tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r); | ||
270 | - tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32)); | ||
271 | + tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r); | ||
272 | + tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32)); | ||
273 | break; | ||
274 | } | ||
275 | goto do_default; | ||
276 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
277 | if (tmp != 2) { | ||
278 | if (tmp) { | ||
279 | do_brcond_true: | ||
280 | - memset(&temps_used, 0, sizeof(temps_used)); | ||
281 | + memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
282 | op->opc = INDEX_op_br; | ||
283 | op->args[0] = op->args[5]; | ||
284 | } else { | ||
285 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
286 | /* Simplify LT/GE comparisons vs zero to a single compare | ||
287 | vs the high word of the input. */ | ||
288 | do_brcond_high: | ||
289 | - memset(&temps_used, 0, sizeof(temps_used)); | ||
290 | + memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
291 | op->opc = INDEX_op_brcond_i32; | ||
292 | op->args[0] = op->args[1]; | ||
293 | op->args[1] = op->args[3]; | ||
294 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
295 | goto do_default; | ||
296 | } | ||
297 | do_brcond_low: | ||
298 | - memset(&temps_used, 0, sizeof(temps_used)); | ||
299 | + memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
300 | op->opc = INDEX_op_brcond_i32; | ||
301 | op->args[1] = op->args[2]; | ||
302 | op->args[2] = op->args[4]; | ||
303 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
304 | op->args[5]); | ||
305 | if (tmp != 2) { | ||
306 | do_setcond_const: | ||
307 | - tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp); | ||
308 | + tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
309 | } else if ((op->args[5] == TCG_COND_LT | ||
310 | || op->args[5] == TCG_COND_GE) | ||
311 | && arg_is_const(op->args[3]) | ||
312 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
313 | if (!(tcg_call_flags(op) | ||
314 | & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) { | ||
315 | for (i = 0; i < nb_globals; i++) { | ||
316 | - if (test_bit(i, temps_used.l)) { | ||
317 | + if (test_bit(i, ctx.temps_used.l)) { | ||
318 | reset_ts(&s->temps[i]); | ||
319 | } | ||
320 | } | ||
321 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
322 | block, otherwise we only trash the output args. "z_mask" is | ||
323 | the non-zero bits mask for the first output arg. */ | ||
324 | if (def->flags & TCG_OPF_BB_END) { | ||
325 | - memset(&temps_used, 0, sizeof(temps_used)); | ||
326 | + memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
327 | } else { | ||
328 | do_reset_output: | ||
329 | for (i = 0; i < nb_oargs; i++) { | ||
330 | -- | ||
331 | 2.25.1 | ||
332 | |||
New patch | |||
1 | Break the final cleanup clause out of the main switch | ||
2 | statement. When fully folding an opcode to mov/movi, | ||
3 | use "continue" to process the next opcode, else break | ||
4 | to fall into the final cleanup. | ||
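In miniature, the loop discipline described above looks like this (a sketch, not the TCG code itself):

    #include <stdio.h>

    int main(void)
    {
        for (int op = 0; op < 3; op++) {
            switch (op) {
            case 0:                       /* fully folded to mov/movi */
                printf("op %d folded\n", op);
                continue;                 /* skip the shared cleanup */
            default:
                break;                    /* fall into the shared cleanup */
            }
            printf("op %d: reset outputs, record z_mask\n", op);
        }
        return 0;
    }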
1 | 5 | ||
6 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
7 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
8 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | --- | ||
11 | tcg/optimize.c | 190 ++++++++++++++++++++++++------------------------- | ||
12 | 1 file changed, 94 insertions(+), 96 deletions(-) | ||
13 | |||
14 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/tcg/optimize.c | ||
17 | +++ b/tcg/optimize.c | ||
18 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
19 | switch (opc) { | ||
20 | CASE_OP_32_64_VEC(mov): | ||
21 | tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); | ||
22 | - break; | ||
23 | + continue; | ||
24 | |||
25 | case INDEX_op_dup_vec: | ||
26 | if (arg_is_const(op->args[1])) { | ||
27 | tmp = arg_info(op->args[1])->val; | ||
28 | tmp = dup_const(TCGOP_VECE(op), tmp); | ||
29 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
30 | - break; | ||
31 | + continue; | ||
32 | } | ||
33 | - goto do_default; | ||
34 | + break; | ||
35 | |||
36 | case INDEX_op_dup2_vec: | ||
37 | assert(TCG_TARGET_REG_BITS == 32); | ||
38 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
39 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], | ||
40 | deposit64(arg_info(op->args[1])->val, 32, 32, | ||
41 | arg_info(op->args[2])->val)); | ||
42 | - break; | ||
43 | + continue; | ||
44 | } else if (args_are_copies(op->args[1], op->args[2])) { | ||
45 | op->opc = INDEX_op_dup_vec; | ||
46 | TCGOP_VECE(op) = MO_32; | ||
47 | nb_iargs = 1; | ||
48 | } | ||
49 | - goto do_default; | ||
50 | + break; | ||
51 | |||
52 | CASE_OP_32_64(not): | ||
53 | CASE_OP_32_64(neg): | ||
54 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
55 | if (arg_is_const(op->args[1])) { | ||
56 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0); | ||
57 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
58 | - break; | ||
59 | + continue; | ||
60 | } | ||
61 | - goto do_default; | ||
62 | + break; | ||
63 | |||
64 | CASE_OP_32_64(bswap16): | ||
65 | CASE_OP_32_64(bswap32): | ||
66 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
67 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, | ||
68 | op->args[2]); | ||
69 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
70 | - break; | ||
71 | + continue; | ||
72 | } | ||
73 | - goto do_default; | ||
74 | + break; | ||
75 | |||
76 | CASE_OP_32_64(add): | ||
77 | CASE_OP_32_64(sub): | ||
78 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
79 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, | ||
80 | arg_info(op->args[2])->val); | ||
81 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
82 | - break; | ||
83 | + continue; | ||
84 | } | ||
85 | - goto do_default; | ||
86 | + break; | ||
87 | |||
88 | CASE_OP_32_64(clz): | ||
89 | CASE_OP_32_64(ctz): | ||
90 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
91 | } else { | ||
92 | tcg_opt_gen_mov(s, op, op->args[0], op->args[2]); | ||
93 | } | ||
94 | - break; | ||
95 | + continue; | ||
96 | } | ||
97 | - goto do_default; | ||
98 | + break; | ||
99 | |||
100 | CASE_OP_32_64(deposit): | ||
101 | if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
102 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
103 | op->args[3], op->args[4], | ||
104 | arg_info(op->args[2])->val); | ||
105 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
106 | - break; | ||
107 | + continue; | ||
108 | } | ||
109 | - goto do_default; | ||
110 | + break; | ||
111 | |||
112 | CASE_OP_32_64(extract): | ||
113 | if (arg_is_const(op->args[1])) { | ||
114 | tmp = extract64(arg_info(op->args[1])->val, | ||
115 | op->args[2], op->args[3]); | ||
116 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
117 | - break; | ||
118 | + continue; | ||
119 | } | ||
120 | - goto do_default; | ||
121 | + break; | ||
122 | |||
123 | CASE_OP_32_64(sextract): | ||
124 | if (arg_is_const(op->args[1])) { | ||
125 | tmp = sextract64(arg_info(op->args[1])->val, | ||
126 | op->args[2], op->args[3]); | ||
127 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
128 | - break; | ||
129 | + continue; | ||
130 | } | ||
131 | - goto do_default; | ||
132 | + break; | ||
133 | |||
134 | CASE_OP_32_64(extract2): | ||
135 | if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
136 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
137 | ((uint32_t)v2 << (32 - shr))); | ||
138 | } | ||
139 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
140 | - break; | ||
141 | + continue; | ||
142 | } | ||
143 | - goto do_default; | ||
144 | + break; | ||
145 | |||
146 | CASE_OP_32_64(setcond): | ||
147 | tmp = do_constant_folding_cond(opc, op->args[1], | ||
148 | op->args[2], op->args[3]); | ||
149 | if (tmp != 2) { | ||
150 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
151 | - break; | ||
152 | + continue; | ||
153 | } | ||
154 | - goto do_default; | ||
155 | + break; | ||
156 | |||
157 | CASE_OP_32_64(brcond): | ||
158 | tmp = do_constant_folding_cond(opc, op->args[0], | ||
159 | op->args[1], op->args[2]); | ||
160 | - if (tmp != 2) { | ||
161 | - if (tmp) { | ||
162 | - memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
163 | - op->opc = INDEX_op_br; | ||
164 | - op->args[0] = op->args[3]; | ||
165 | - } else { | ||
166 | - tcg_op_remove(s, op); | ||
167 | - } | ||
168 | + switch (tmp) { | ||
169 | + case 0: | ||
170 | + tcg_op_remove(s, op); | ||
171 | + continue; | ||
172 | + case 1: | ||
173 | + memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
174 | + op->opc = opc = INDEX_op_br; | ||
175 | + op->args[0] = op->args[3]; | ||
176 | break; | ||
177 | } | ||
178 | - goto do_default; | ||
179 | + break; | ||
180 | |||
181 | CASE_OP_32_64(movcond): | ||
182 | tmp = do_constant_folding_cond(opc, op->args[1], | ||
183 | op->args[2], op->args[5]); | ||
184 | if (tmp != 2) { | ||
185 | tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]); | ||
186 | - break; | ||
187 | + continue; | ||
188 | } | ||
189 | if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { | ||
190 | uint64_t tv = arg_info(op->args[3])->val; | ||
191 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
192 | if (fv == 1 && tv == 0) { | ||
193 | cond = tcg_invert_cond(cond); | ||
194 | } else if (!(tv == 1 && fv == 0)) { | ||
195 | - goto do_default; | ||
196 | + break; | ||
197 | } | ||
198 | op->args[3] = cond; | ||
199 | op->opc = opc = (opc == INDEX_op_movcond_i32 | ||
200 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
201 | : INDEX_op_setcond_i64); | ||
202 | nb_iargs = 2; | ||
203 | } | ||
204 | - goto do_default; | ||
205 | + break; | ||
206 | |||
207 | case INDEX_op_add2_i32: | ||
208 | case INDEX_op_sub2_i32: | ||
209 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
210 | rh = op->args[1]; | ||
211 | tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a); | ||
212 | tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32)); | ||
213 | - break; | ||
214 | + continue; | ||
215 | } | ||
216 | - goto do_default; | ||
217 | + break; | ||
218 | |||
219 | case INDEX_op_mulu2_i32: | ||
220 | if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) { | ||
221 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
222 | rh = op->args[1]; | ||
223 | tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r); | ||
224 | tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32)); | ||
225 | - break; | ||
226 | + continue; | ||
227 | } | ||
228 | - goto do_default; | ||
229 | + break; | ||
230 | |||
231 | case INDEX_op_brcond2_i32: | ||
232 | tmp = do_constant_folding_cond2(&op->args[0], &op->args[2], | ||
233 | op->args[4]); | ||
234 | - if (tmp != 2) { | ||
235 | - if (tmp) { | ||
236 | - do_brcond_true: | ||
237 | - memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
238 | - op->opc = INDEX_op_br; | ||
239 | - op->args[0] = op->args[5]; | ||
240 | - } else { | ||
241 | + if (tmp == 0) { | ||
242 | do_brcond_false: | ||
243 | - tcg_op_remove(s, op); | ||
244 | - } | ||
245 | - } else if ((op->args[4] == TCG_COND_LT | ||
246 | - || op->args[4] == TCG_COND_GE) | ||
247 | - && arg_is_const(op->args[2]) | ||
248 | - && arg_info(op->args[2])->val == 0 | ||
249 | - && arg_is_const(op->args[3]) | ||
250 | - && arg_info(op->args[3])->val == 0) { | ||
251 | + tcg_op_remove(s, op); | ||
252 | + continue; | ||
253 | + } | ||
254 | + if (tmp == 1) { | ||
255 | + do_brcond_true: | ||
256 | + op->opc = opc = INDEX_op_br; | ||
257 | + op->args[0] = op->args[5]; | ||
258 | + break; | ||
259 | + } | ||
260 | + if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE) | ||
261 | + && arg_is_const(op->args[2]) | ||
262 | + && arg_info(op->args[2])->val == 0 | ||
263 | + && arg_is_const(op->args[3]) | ||
264 | + && arg_info(op->args[3])->val == 0) { | ||
265 | /* Simplify LT/GE comparisons vs zero to a single compare | ||
266 | vs the high word of the input. */ | ||
267 | do_brcond_high: | ||
268 | - memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
269 | - op->opc = INDEX_op_brcond_i32; | ||
270 | + op->opc = opc = INDEX_op_brcond_i32; | ||
271 | op->args[0] = op->args[1]; | ||
272 | op->args[1] = op->args[3]; | ||
273 | op->args[2] = op->args[4]; | ||
274 | op->args[3] = op->args[5]; | ||
275 | - } else if (op->args[4] == TCG_COND_EQ) { | ||
276 | + break; | ||
277 | + } | ||
278 | + if (op->args[4] == TCG_COND_EQ) { | ||
279 | /* Simplify EQ comparisons where one of the pairs | ||
280 | can be simplified. */ | ||
281 | tmp = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
282 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
283 | if (tmp == 0) { | ||
284 | goto do_brcond_false; | ||
285 | } else if (tmp != 1) { | ||
286 | - goto do_default; | ||
287 | + break; | ||
288 | } | ||
289 | do_brcond_low: | ||
290 | memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
291 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
292 | op->args[1] = op->args[2]; | ||
293 | op->args[2] = op->args[4]; | ||
294 | op->args[3] = op->args[5]; | ||
295 | - } else if (op->args[4] == TCG_COND_NE) { | ||
296 | + break; | ||
297 | + } | ||
298 | + if (op->args[4] == TCG_COND_NE) { | ||
299 | /* Simplify NE comparisons where one of the pairs | ||
300 | can be simplified. */ | ||
301 | tmp = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
302 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
303 | } else if (tmp == 1) { | ||
304 | goto do_brcond_true; | ||
305 | } | ||
306 | - goto do_default; | ||
307 | - } else { | ||
308 | - goto do_default; | ||
309 | } | ||
310 | break; | ||
311 | |||
312 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
313 | if (tmp != 2) { | ||
314 | do_setcond_const: | ||
315 | tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
316 | - } else if ((op->args[5] == TCG_COND_LT | ||
317 | - || op->args[5] == TCG_COND_GE) | ||
318 | - && arg_is_const(op->args[3]) | ||
319 | - && arg_info(op->args[3])->val == 0 | ||
320 | - && arg_is_const(op->args[4]) | ||
321 | - && arg_info(op->args[4])->val == 0) { | ||
322 | + continue; | ||
323 | + } | ||
324 | + if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE) | ||
325 | + && arg_is_const(op->args[3]) | ||
326 | + && arg_info(op->args[3])->val == 0 | ||
327 | + && arg_is_const(op->args[4]) | ||
328 | + && arg_info(op->args[4])->val == 0) { | ||
329 | /* Simplify LT/GE comparisons vs zero to a single compare | ||
330 | vs the high word of the input. */ | ||
331 | do_setcond_high: | ||
332 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
333 | op->args[1] = op->args[2]; | ||
334 | op->args[2] = op->args[4]; | ||
335 | op->args[3] = op->args[5]; | ||
336 | - } else if (op->args[5] == TCG_COND_EQ) { | ||
337 | + break; | ||
338 | + } | ||
339 | + if (op->args[5] == TCG_COND_EQ) { | ||
340 | /* Simplify EQ comparisons where one of the pairs | ||
341 | can be simplified. */ | ||
342 | tmp = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
343 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
344 | if (tmp == 0) { | ||
345 | goto do_setcond_high; | ||
346 | } else if (tmp != 1) { | ||
347 | - goto do_default; | ||
348 | + break; | ||
349 | } | ||
350 | do_setcond_low: | ||
351 | reset_temp(op->args[0]); | ||
352 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
353 | op->opc = INDEX_op_setcond_i32; | ||
354 | op->args[2] = op->args[3]; | ||
355 | op->args[3] = op->args[5]; | ||
356 | - } else if (op->args[5] == TCG_COND_NE) { | ||
357 | + break; | ||
358 | + } | ||
359 | + if (op->args[5] == TCG_COND_NE) { | ||
360 | /* Simplify NE comparisons where one of the pairs | ||
361 | can be simplified. */ | ||
362 | tmp = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
363 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
364 | } else if (tmp == 1) { | ||
365 | goto do_setcond_const; | ||
366 | } | ||
367 | - goto do_default; | ||
368 | - } else { | ||
369 | - goto do_default; | ||
370 | } | ||
371 | break; | ||
372 | |||
373 | - case INDEX_op_call: | ||
374 | - if (!(tcg_call_flags(op) | ||
375 | + default: | ||
376 | + break; | ||
377 | + } | ||
378 | + | ||
379 | + /* Some of the folding above can change opc. */ | ||
380 | + opc = op->opc; | ||
381 | + def = &tcg_op_defs[opc]; | ||
382 | + if (def->flags & TCG_OPF_BB_END) { | ||
383 | + memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
384 | + } else { | ||
385 | + if (opc == INDEX_op_call && | ||
386 | + !(tcg_call_flags(op) | ||
387 | & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) { | ||
388 | for (i = 0; i < nb_globals; i++) { | ||
389 | if (test_bit(i, ctx.temps_used.l)) { | ||
390 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
391 | } | ||
392 | } | ||
393 | } | ||
394 | - goto do_reset_output; | ||
395 | |||
396 | - default: | ||
397 | - do_default: | ||
398 | - /* Default case: we know nothing about operation (or were unable | ||
399 | - to compute the operation result) so no propagation is done. | ||
400 | - We trash everything if the operation is the end of a basic | ||
401 | - block, otherwise we only trash the output args. "z_mask" is | ||
402 | - the non-zero bits mask for the first output arg. */ | ||
403 | - if (def->flags & TCG_OPF_BB_END) { | ||
404 | - memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
405 | - } else { | ||
406 | - do_reset_output: | ||
407 | - for (i = 0; i < nb_oargs; i++) { | ||
408 | - reset_temp(op->args[i]); | ||
409 | - /* Save the corresponding known-zero bits mask for the | ||
410 | - first output argument (only one supported so far). */ | ||
411 | - if (i == 0) { | ||
412 | - arg_info(op->args[i])->z_mask = z_mask; | ||
413 | - } | ||
414 | + for (i = 0; i < nb_oargs; i++) { | ||
415 | + reset_temp(op->args[i]); | ||
416 | + /* Save the corresponding known-zero bits mask for the | ||
417 | + first output argument (only one supported so far). */ | ||
418 | + if (i == 0) { | ||
419 | + arg_info(op->args[i])->z_mask = z_mask; | ||
420 | } | ||
421 | } | ||
422 | - break; | ||
423 | } | ||
424 | |||
425 | /* Eliminate duplicate and redundant fence instructions. */ | ||
426 | -- | ||
427 | 2.25.1 | ||
428 | |||
New patch | |||
1 | 1 | Adjust the interface to take the OptContext parameter instead | |
2 | of TCGContext or both. | ||
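A sketch of the resulting calling convention, with stand-in types (illustrative, not QEMU code): the TCGContext rides inside the OptContext, so helpers take a single pointer and reach it via ctx->tcg when needed.

    #include <stdio.h>

    typedef struct Engine { const char *name; } Engine; /* stand-in for TCGContext */

    typedef struct OptContext {
        Engine *tcg;            /* carried by the context */
    } OptContext;

    /* was: helper(Engine *s, OptContext *ctx) */
    static void helper(OptContext *ctx)
    {
        printf("using %s\n", ctx->tcg->name);
    }

    int main(void)
    {
        Engine e = { "tcg" };
        OptContext ctx = { .tcg = &e };
        helper(&ctx);
        return 0;
    }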
3 | |||
4 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 67 +++++++++++++++++++++++++------------------------- | ||
9 | 1 file changed, 34 insertions(+), 33 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo { | ||
16 | } TempOptInfo; | ||
17 | |||
18 | typedef struct OptContext { | ||
19 | + TCGContext *tcg; | ||
20 | TCGTempSet temps_used; | ||
21 | } OptContext; | ||
22 | |||
23 | @@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2) | ||
24 | return ts_are_copies(arg_temp(arg1), arg_temp(arg2)); | ||
25 | } | ||
26 | |||
27 | -static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) | ||
28 | +static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
29 | { | ||
30 | TCGTemp *dst_ts = arg_temp(dst); | ||
31 | TCGTemp *src_ts = arg_temp(src); | ||
32 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) | ||
33 | TCGOpcode new_op; | ||
34 | |||
35 | if (ts_are_copies(dst_ts, src_ts)) { | ||
36 | - tcg_op_remove(s, op); | ||
37 | + tcg_op_remove(ctx->tcg, op); | ||
38 | return; | ||
39 | } | ||
40 | |||
41 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) | ||
42 | } | ||
43 | } | ||
44 | |||
45 | -static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx, | ||
46 | - TCGOp *op, TCGArg dst, uint64_t val) | ||
47 | +static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op, | ||
48 | + TCGArg dst, uint64_t val) | ||
49 | { | ||
50 | const TCGOpDef *def = &tcg_op_defs[op->opc]; | ||
51 | TCGType type; | ||
52 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx, | ||
53 | /* Convert movi to mov with constant temp. */ | ||
54 | tv = tcg_constant_internal(type, val); | ||
55 | init_ts_info(ctx, tv); | ||
56 | - tcg_opt_gen_mov(s, op, dst, temp_arg(tv)); | ||
57 | + tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv)); | ||
58 | } | ||
59 | |||
60 | static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y) | ||
61 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
62 | { | ||
63 | int nb_temps, nb_globals, i; | ||
64 | TCGOp *op, *op_next, *prev_mb = NULL; | ||
65 | - OptContext ctx = {}; | ||
66 | + OptContext ctx = { .tcg = s }; | ||
67 | |||
68 | /* Array VALS has an element for each temp. | ||
69 | If this temp holds a constant then its value is kept in VALS' element. | ||
70 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
71 | CASE_OP_32_64(rotr): | ||
72 | if (arg_is_const(op->args[1]) | ||
73 | && arg_info(op->args[1])->val == 0) { | ||
74 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0); | ||
75 | + tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
76 | continue; | ||
77 | } | ||
78 | break; | ||
79 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
80 | if (!arg_is_const(op->args[1]) | ||
81 | && arg_is_const(op->args[2]) | ||
82 | && arg_info(op->args[2])->val == 0) { | ||
83 | - tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); | ||
84 | + tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
85 | continue; | ||
86 | } | ||
87 | break; | ||
88 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
89 | if (!arg_is_const(op->args[1]) | ||
90 | && arg_is_const(op->args[2]) | ||
91 | && arg_info(op->args[2])->val == -1) { | ||
92 | - tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); | ||
93 | + tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
94 | continue; | ||
95 | } | ||
96 | break; | ||
97 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
98 | |||
99 | if (partmask == 0) { | ||
100 | tcg_debug_assert(nb_oargs == 1); | ||
101 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0); | ||
102 | + tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
103 | continue; | ||
104 | } | ||
105 | if (affected == 0) { | ||
106 | tcg_debug_assert(nb_oargs == 1); | ||
107 | - tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); | ||
108 | + tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
109 | continue; | ||
110 | } | ||
111 | |||
112 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
113 | CASE_OP_32_64(mulsh): | ||
114 | if (arg_is_const(op->args[2]) | ||
115 | && arg_info(op->args[2])->val == 0) { | ||
116 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0); | ||
117 | + tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
118 | continue; | ||
119 | } | ||
120 | break; | ||
121 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
122 | CASE_OP_32_64_VEC(or): | ||
123 | CASE_OP_32_64_VEC(and): | ||
124 | if (args_are_copies(op->args[1], op->args[2])) { | ||
125 | - tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); | ||
126 | + tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
127 | continue; | ||
128 | } | ||
129 | break; | ||
130 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
131 | CASE_OP_32_64_VEC(sub): | ||
132 | CASE_OP_32_64_VEC(xor): | ||
133 | if (args_are_copies(op->args[1], op->args[2])) { | ||
134 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0); | ||
135 | + tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
136 | continue; | ||
137 | } | ||
138 | break; | ||
139 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
140 | allocator where needed and possible. Also detect copies. */ | ||
141 | switch (opc) { | ||
142 | CASE_OP_32_64_VEC(mov): | ||
143 | - tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); | ||
144 | + tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
145 | continue; | ||
146 | |||
147 | case INDEX_op_dup_vec: | ||
148 | if (arg_is_const(op->args[1])) { | ||
149 | tmp = arg_info(op->args[1])->val; | ||
150 | tmp = dup_const(TCGOP_VECE(op), tmp); | ||
151 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
152 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
153 | continue; | ||
154 | } | ||
155 | break; | ||
156 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
157 | case INDEX_op_dup2_vec: | ||
158 | assert(TCG_TARGET_REG_BITS == 32); | ||
159 | if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
160 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], | ||
161 | + tcg_opt_gen_movi(&ctx, op, op->args[0], | ||
162 | deposit64(arg_info(op->args[1])->val, 32, 32, | ||
163 | arg_info(op->args[2])->val)); | ||
164 | continue; | ||
165 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
166 | case INDEX_op_extrh_i64_i32: | ||
167 | if (arg_is_const(op->args[1])) { | ||
168 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0); | ||
169 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
170 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
171 | continue; | ||
172 | } | ||
173 | break; | ||
174 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
175 | if (arg_is_const(op->args[1])) { | ||
176 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, | ||
177 | op->args[2]); | ||
178 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
179 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
180 | continue; | ||
181 | } | ||
182 | break; | ||
183 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
184 | if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
185 | tmp = do_constant_folding(opc, arg_info(op->args[1])->val, | ||
186 | arg_info(op->args[2])->val); | ||
187 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
188 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
189 | continue; | ||
190 | } | ||
191 | break; | ||
192 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
193 | TCGArg v = arg_info(op->args[1])->val; | ||
194 | if (v != 0) { | ||
195 | tmp = do_constant_folding(opc, v, 0); | ||
196 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
197 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
198 | } else { | ||
199 | - tcg_opt_gen_mov(s, op, op->args[0], op->args[2]); | ||
200 | + tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]); | ||
201 | } | ||
202 | continue; | ||
203 | } | ||
204 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
205 | tmp = deposit64(arg_info(op->args[1])->val, | ||
206 | op->args[3], op->args[4], | ||
207 | arg_info(op->args[2])->val); | ||
208 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
209 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
210 | continue; | ||
211 | } | ||
212 | break; | ||
213 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
214 | if (arg_is_const(op->args[1])) { | ||
215 | tmp = extract64(arg_info(op->args[1])->val, | ||
216 | op->args[2], op->args[3]); | ||
217 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
218 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
219 | continue; | ||
220 | } | ||
221 | break; | ||
222 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
223 | if (arg_is_const(op->args[1])) { | ||
224 | tmp = sextract64(arg_info(op->args[1])->val, | ||
225 | op->args[2], op->args[3]); | ||
226 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
227 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
228 | continue; | ||
229 | } | ||
230 | break; | ||
231 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
232 | tmp = (int32_t)(((uint32_t)v1 >> shr) | | ||
233 | ((uint32_t)v2 << (32 - shr))); | ||
234 | } | ||
235 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
236 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
237 | continue; | ||
238 | } | ||
239 | break; | ||
240 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
241 | tmp = do_constant_folding_cond(opc, op->args[1], | ||
242 | op->args[2], op->args[3]); | ||
243 | if (tmp != 2) { | ||
244 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
245 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
246 | continue; | ||
247 | } | ||
248 | break; | ||
249 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
250 | tmp = do_constant_folding_cond(opc, op->args[1], | ||
251 | op->args[2], op->args[5]); | ||
252 | if (tmp != 2) { | ||
253 | - tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]); | ||
254 | + tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]); | ||
255 | continue; | ||
256 | } | ||
257 | if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { | ||
258 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
259 | |||
260 | rl = op->args[0]; | ||
261 | rh = op->args[1]; | ||
262 | - tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a); | ||
263 | - tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32)); | ||
264 | + tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a); | ||
265 | + tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32)); | ||
266 | continue; | ||
267 | } | ||
268 | break; | ||
269 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
270 | |||
271 | rl = op->args[0]; | ||
272 | rh = op->args[1]; | ||
273 | - tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r); | ||
274 | - tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32)); | ||
275 | + tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r); | ||
276 | + tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32)); | ||
277 | continue; | ||
278 | } | ||
279 | break; | ||
280 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
281 | op->args[5]); | ||
282 | if (tmp != 2) { | ||
283 | do_setcond_const: | ||
284 | - tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp); | ||
285 | + tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
286 | continue; | ||
287 | } | ||
288 | if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE) | ||
289 | -- | ||
290 | 2.25.1 | ||
291 | |||
New patch | |||
---|---|---|---|
1 | This will expose the variable to subroutines that | ||
2 | will be broken out of tcg_optimize. | ||
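As a sketch of the shape this enables (hypothetical helper name, not
part of this patch): once the value lives in the context, a split-out
subroutine can reach it through the one context argument.

    static void fold_barrier(OptContext *ctx, TCGOp *op)
    {
        if (ctx->prev_mb) {
            /* merge op into the in-flight barrier */
        } else {
            ctx->prev_mb = op;   /* remember the barrier for later ops */
        }
    }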
1 | 3 | ||
4 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/optimize.c | 11 ++++++----- | ||
10 | 1 file changed, 6 insertions(+), 5 deletions(-) | ||
11 | |||
12 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/optimize.c | ||
15 | +++ b/tcg/optimize.c | ||
16 | @@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo { | ||
17 | |||
18 | typedef struct OptContext { | ||
19 | TCGContext *tcg; | ||
20 | + TCGOp *prev_mb; | ||
21 | TCGTempSet temps_used; | ||
22 | } OptContext; | ||
23 | |||
24 | @@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2) | ||
25 | void tcg_optimize(TCGContext *s) | ||
26 | { | ||
27 | int nb_temps, nb_globals, i; | ||
28 | - TCGOp *op, *op_next, *prev_mb = NULL; | ||
29 | + TCGOp *op, *op_next; | ||
30 | OptContext ctx = { .tcg = s }; | ||
31 | |||
32 | /* Array VALS has an element for each temp. | ||
33 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
34 | } | ||
35 | |||
36 | /* Eliminate duplicate and redundant fence instructions. */ | ||
37 | - if (prev_mb) { | ||
38 | + if (ctx.prev_mb) { | ||
39 | switch (opc) { | ||
40 | case INDEX_op_mb: | ||
41 | /* Merge two barriers of the same type into one, | ||
42 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
43 | * barrier. This is stricter than specified but for | ||
44 | * the purposes of TCG is better than not optimizing. | ||
45 | */ | ||
46 | - prev_mb->args[0] |= op->args[0]; | ||
47 | + ctx.prev_mb->args[0] |= op->args[0]; | ||
48 | tcg_op_remove(s, op); | ||
49 | break; | ||
50 | |||
51 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
52 | case INDEX_op_qemu_st_i64: | ||
53 | case INDEX_op_call: | ||
54 | /* Opcodes that touch guest memory stop the optimization. */ | ||
55 | - prev_mb = NULL; | ||
56 | + ctx.prev_mb = NULL; | ||
57 | break; | ||
58 | } | ||
59 | } else if (opc == INDEX_op_mb) { | ||
60 | - prev_mb = op; | ||
61 | + ctx.prev_mb = op; | ||
62 | } | ||
63 | } | ||
64 | } | ||
65 | -- | ||
66 | 2.25.1 | ||
67 | |||
New patch | |||
---|---|---|---|
1 | There was no real reason for calls to have separate code here. | ||
2 | Unify init for calls vs non-calls using the call path, which | ||
3 | handles TCG_CALL_DUMMY_ARG. | ||
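For illustration, why one loop now serves both paths (assuming, as in
current tcg.h, that TCG_CALL_DUMMY_ARG encodes as a zero TCGArg):

    for (int i = 0; i < nb_args; i++) {
        TCGTemp *ts = arg_temp(op->args[i]); /* NULL for a dummy arg */
        if (ts) {
            init_ts_info(ctx, ts);
        }
    }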
1 | 4 | ||
5 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
6 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
7 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
8 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
9 | --- | ||
10 | tcg/optimize.c | 25 +++++++++++-------------- | ||
11 | 1 file changed, 11 insertions(+), 14 deletions(-) | ||
12 | |||
13 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/tcg/optimize.c | ||
16 | +++ b/tcg/optimize.c | ||
17 | @@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts) | ||
18 | } | ||
19 | } | ||
20 | |||
21 | -static void init_arg_info(OptContext *ctx, TCGArg arg) | ||
22 | -{ | ||
23 | - init_ts_info(ctx, arg_temp(arg)); | ||
24 | -} | ||
25 | - | ||
26 | static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts) | ||
27 | { | ||
28 | TCGTemp *i, *g, *l; | ||
29 | @@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2) | ||
30 | return false; | ||
31 | } | ||
32 | |||
33 | +static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args) | ||
34 | +{ | ||
35 | + for (int i = 0; i < nb_args; i++) { | ||
36 | + TCGTemp *ts = arg_temp(op->args[i]); | ||
37 | + if (ts) { | ||
38 | + init_ts_info(ctx, ts); | ||
39 | + } | ||
40 | + } | ||
41 | +} | ||
42 | + | ||
43 | /* Propagate constants and copies, fold constant expressions. */ | ||
44 | void tcg_optimize(TCGContext *s) | ||
45 | { | ||
46 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
47 | if (opc == INDEX_op_call) { | ||
48 | nb_oargs = TCGOP_CALLO(op); | ||
49 | nb_iargs = TCGOP_CALLI(op); | ||
50 | - for (i = 0; i < nb_oargs + nb_iargs; i++) { | ||
51 | - TCGTemp *ts = arg_temp(op->args[i]); | ||
52 | - if (ts) { | ||
53 | - init_ts_info(&ctx, ts); | ||
54 | - } | ||
55 | - } | ||
56 | } else { | ||
57 | nb_oargs = def->nb_oargs; | ||
58 | nb_iargs = def->nb_iargs; | ||
59 | - for (i = 0; i < nb_oargs + nb_iargs; i++) { | ||
60 | - init_arg_info(&ctx, op->args[i]); | ||
61 | - } | ||
62 | } | ||
63 | + init_arguments(&ctx, op, nb_oargs + nb_iargs); | ||
64 | |||
65 | /* Do copy propagation */ | ||
66 | for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { | ||
67 | -- | ||
68 | 2.25.1 | ||
69 | |||
New patch | |||
---|---|---|---|
1 | Continue splitting tcg_optimize: hoist copy propagation into its own helper. | ||
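In IR terms the hoisted pass does the following (made-up temp names):

    /* before:                      after:
     *   mov_i32 t1, t0               mov_i32 t1, t0
     *   add_i32 t2, t1, t1           add_i32 t2, t0, t0
     * each input that is a known copy is rewritten to the preferred
     * representative of its copy set. */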
1 | 2 | ||
3 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 22 ++++++++++++++-------- | ||
9 | 1 file changed, 14 insertions(+), 8 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args) | ||
16 | } | ||
17 | } | ||
18 | |||
19 | +static void copy_propagate(OptContext *ctx, TCGOp *op, | ||
20 | + int nb_oargs, int nb_iargs) | ||
21 | +{ | ||
22 | + TCGContext *s = ctx->tcg; | ||
23 | + | ||
24 | + for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) { | ||
25 | + TCGTemp *ts = arg_temp(op->args[i]); | ||
26 | + if (ts && ts_is_copy(ts)) { | ||
27 | + op->args[i] = temp_arg(find_better_copy(s, ts)); | ||
28 | + } | ||
29 | + } | ||
30 | +} | ||
31 | + | ||
32 | /* Propagate constants and copies, fold constant expressions. */ | ||
33 | void tcg_optimize(TCGContext *s) | ||
34 | { | ||
35 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
36 | nb_iargs = def->nb_iargs; | ||
37 | } | ||
38 | init_arguments(&ctx, op, nb_oargs + nb_iargs); | ||
39 | - | ||
40 | - /* Do copy propagation */ | ||
41 | - for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { | ||
42 | - TCGTemp *ts = arg_temp(op->args[i]); | ||
43 | - if (ts && ts_is_copy(ts)) { | ||
44 | - op->args[i] = temp_arg(find_better_copy(s, ts)); | ||
45 | - } | ||
46 | - } | ||
47 | + copy_propagate(&ctx, op, nb_oargs, nb_iargs); | ||
48 | |||
49 | /* For commutative operations make constant second argument */ | ||
50 | switch (opc) { | ||
51 | -- | ||
52 | 2.25.1 | ||
53 | |||
New patch | |||
---|---|---|---|
1 | Calls are special in that they have a variable number | ||
2 | of arguments, and need to be able to clobber globals. | ||
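A sketch of why global tracking must be reset across such a call
(hypothetical helper name):

    /* movi_i32 g0, 5        g0 is known to hold 5
     * call helper_foo       may read or write any global
     *                       g0 must be treated as unknown again,
     * unless the call is flagged TCG_CALL_NO_READ_GLOBALS or
     * TCG_CALL_NO_WRITE_GLOBALS. */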
1 | 3 | ||
4 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------ | ||
9 | 1 file changed, 41 insertions(+), 22 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op, | ||
16 | } | ||
17 | } | ||
18 | |||
19 | +static bool fold_call(OptContext *ctx, TCGOp *op) | ||
20 | +{ | ||
21 | + TCGContext *s = ctx->tcg; | ||
22 | + int nb_oargs = TCGOP_CALLO(op); | ||
23 | + int nb_iargs = TCGOP_CALLI(op); | ||
24 | + int flags, i; | ||
25 | + | ||
26 | + init_arguments(ctx, op, nb_oargs + nb_iargs); | ||
27 | + copy_propagate(ctx, op, nb_oargs, nb_iargs); | ||
28 | + | ||
29 | + /* If the function reads or writes globals, reset temp data. */ | ||
30 | + flags = tcg_call_flags(op); | ||
31 | + if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) { | ||
32 | + int nb_globals = s->nb_globals; | ||
33 | + | ||
34 | + for (i = 0; i < nb_globals; i++) { | ||
35 | + if (test_bit(i, ctx->temps_used.l)) { | ||
36 | + reset_ts(&ctx->tcg->temps[i]); | ||
37 | + } | ||
38 | + } | ||
39 | + } | ||
40 | + | ||
41 | + /* Reset temp data for outputs. */ | ||
42 | + for (i = 0; i < nb_oargs; i++) { | ||
43 | + reset_temp(op->args[i]); | ||
44 | + } | ||
45 | + | ||
46 | + /* Stop optimizing MB across calls. */ | ||
47 | + ctx->prev_mb = NULL; | ||
48 | + return true; | ||
49 | +} | ||
50 | + | ||
51 | /* Propagate constants and copies, fold constant expressions. */ | ||
52 | void tcg_optimize(TCGContext *s) | ||
53 | { | ||
54 | - int nb_temps, nb_globals, i; | ||
55 | + int nb_temps, i; | ||
56 | TCGOp *op, *op_next; | ||
57 | OptContext ctx = { .tcg = s }; | ||
58 | |||
59 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
60 | available through the doubly linked circular list. */ | ||
61 | |||
62 | nb_temps = s->nb_temps; | ||
63 | - nb_globals = s->nb_globals; | ||
64 | - | ||
65 | for (i = 0; i < nb_temps; ++i) { | ||
66 | s->temps[i].state_ptr = NULL; | ||
67 | } | ||
68 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
69 | uint64_t z_mask, partmask, affected, tmp; | ||
70 | int nb_oargs, nb_iargs; | ||
71 | TCGOpcode opc = op->opc; | ||
72 | - const TCGOpDef *def = &tcg_op_defs[opc]; | ||
73 | + const TCGOpDef *def; | ||
74 | |||
75 | - /* Count the arguments, and initialize the temps that are | ||
76 | - going to be used */ | ||
77 | + /* Calls are special. */ | ||
78 | if (opc == INDEX_op_call) { | ||
79 | - nb_oargs = TCGOP_CALLO(op); | ||
80 | - nb_iargs = TCGOP_CALLI(op); | ||
81 | - } else { | ||
82 | - nb_oargs = def->nb_oargs; | ||
83 | - nb_iargs = def->nb_iargs; | ||
84 | + fold_call(&ctx, op); | ||
85 | + continue; | ||
86 | } | ||
87 | + | ||
88 | + def = &tcg_op_defs[opc]; | ||
89 | + nb_oargs = def->nb_oargs; | ||
90 | + nb_iargs = def->nb_iargs; | ||
91 | init_arguments(&ctx, op, nb_oargs + nb_iargs); | ||
92 | copy_propagate(&ctx, op, nb_oargs, nb_iargs); | ||
93 | |||
94 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
95 | if (def->flags & TCG_OPF_BB_END) { | ||
96 | memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
97 | } else { | ||
98 | - if (opc == INDEX_op_call && | ||
99 | - !(tcg_call_flags(op) | ||
100 | - & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) { | ||
101 | - for (i = 0; i < nb_globals; i++) { | ||
102 | - if (test_bit(i, ctx.temps_used.l)) { | ||
103 | - reset_ts(&s->temps[i]); | ||
104 | - } | ||
105 | - } | ||
106 | - } | ||
107 | - | ||
108 | for (i = 0; i < nb_oargs; i++) { | ||
109 | reset_temp(op->args[i]); | ||
110 | /* Save the corresponding known-zero bits mask for the | ||
111 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
112 | case INDEX_op_qemu_st_i32: | ||
113 | case INDEX_op_qemu_st8_i32: | ||
114 | case INDEX_op_qemu_st_i64: | ||
115 | - case INDEX_op_call: | ||
116 | /* Opcodes that touch guest memory stop the optimization. */ | ||
117 | ctx.prev_mb = NULL; | ||
118 | break; | ||
119 | -- | ||
120 | 2.25.1 | ||
121 | |||
New patch | |||
---|---|---|---|
1 | Rather than try to keep these up-to-date across folding, | ||
2 | re-read nb_oargs at the end, after re-reading the opcode. | ||
1 | 3 | ||
4 | A couple of asserts need dropping, but that will take care | ||
5 | of itself as we split the function further. | ||
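The resulting pattern, in sketch form:

    /* Folding may rewrite op->opc (e.g. movcond -> setcond), so
     * re-derive the metadata whenever it is next needed: */
    opc = op->opc;
    def = &tcg_op_defs[opc];
    /* def->nb_oargs is now always in sync with the current opcode. */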
6 | |||
7 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
8 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | --- | ||
11 | tcg/optimize.c | 14 ++++---------- | ||
12 | 1 file changed, 4 insertions(+), 10 deletions(-) | ||
13 | |||
14 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/tcg/optimize.c | ||
17 | +++ b/tcg/optimize.c | ||
18 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
19 | |||
20 | QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) { | ||
21 | uint64_t z_mask, partmask, affected, tmp; | ||
22 | - int nb_oargs, nb_iargs; | ||
23 | TCGOpcode opc = op->opc; | ||
24 | const TCGOpDef *def; | ||
25 | |||
26 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
27 | } | ||
28 | |||
29 | def = &tcg_op_defs[opc]; | ||
30 | - nb_oargs = def->nb_oargs; | ||
31 | - nb_iargs = def->nb_iargs; | ||
32 | - init_arguments(&ctx, op, nb_oargs + nb_iargs); | ||
33 | - copy_propagate(&ctx, op, nb_oargs, nb_iargs); | ||
34 | + init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs); | ||
35 | + copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs); | ||
36 | |||
37 | /* For commutative operations make constant second argument */ | ||
38 | switch (opc) { | ||
39 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
40 | |||
41 | CASE_OP_32_64(qemu_ld): | ||
42 | { | ||
43 | - MemOpIdx oi = op->args[nb_oargs + nb_iargs]; | ||
44 | + MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs]; | ||
45 | MemOp mop = get_memop(oi); | ||
46 | if (!(mop & MO_SIGN)) { | ||
47 | z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1; | ||
48 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
49 | } | ||
50 | |||
51 | if (partmask == 0) { | ||
52 | - tcg_debug_assert(nb_oargs == 1); | ||
53 | tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
54 | continue; | ||
55 | } | ||
56 | if (affected == 0) { | ||
57 | - tcg_debug_assert(nb_oargs == 1); | ||
58 | tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
59 | continue; | ||
60 | } | ||
61 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
62 | } else if (args_are_copies(op->args[1], op->args[2])) { | ||
63 | op->opc = INDEX_op_dup_vec; | ||
64 | TCGOP_VECE(op) = MO_32; | ||
65 | - nb_iargs = 1; | ||
66 | } | ||
67 | break; | ||
68 | |||
69 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
70 | op->opc = opc = (opc == INDEX_op_movcond_i32 | ||
71 | ? INDEX_op_setcond_i32 | ||
72 | : INDEX_op_setcond_i64); | ||
73 | - nb_iargs = 2; | ||
74 | } | ||
75 | break; | ||
76 | |||
77 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
78 | if (def->flags & TCG_OPF_BB_END) { | ||
79 | memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
80 | } else { | ||
81 | + int nb_oargs = def->nb_oargs; | ||
82 | for (i = 0; i < nb_oargs; i++) { | ||
83 | reset_temp(op->args[i]); | ||
84 | /* Save the corresponding known-zero bits mask for the | ||
85 | -- | ||
86 | 2.25.1 | ||
87 | |||
New patch | |||
---|---|---|---|
1 | 1 | Return -1 instead of 2 for failure, so that we can | |
2 | use comparisons against 0 for all cases. | ||
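Callers can then use the natural comparisons:

    int i = do_constant_folding_cond(opc, x, y, c);
    if (i == 0) {
        /* condition known false */
    } else if (i > 0) {
        /* condition known true */
    } else {
        /* i < 0: not decidable at translation time */
    }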
3 | |||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------ | ||
9 | 1 file changed, 74 insertions(+), 71 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c) | ||
16 | } | ||
17 | } | ||
18 | |||
19 | -/* Return 2 if the condition can't be simplified, and the result | ||
20 | - of the condition (0 or 1) if it can */ | ||
21 | -static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x, | ||
22 | - TCGArg y, TCGCond c) | ||
23 | +/* | ||
24 | + * Return -1 if the condition can't be simplified, | ||
25 | + * and the result of the condition (0 or 1) if it can. | ||
26 | + */ | ||
27 | +static int do_constant_folding_cond(TCGOpcode op, TCGArg x, | ||
28 | + TCGArg y, TCGCond c) | ||
29 | { | ||
30 | uint64_t xv = arg_info(x)->val; | ||
31 | uint64_t yv = arg_info(y)->val; | ||
32 | @@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x, | ||
33 | case TCG_COND_GEU: | ||
34 | return 1; | ||
35 | default: | ||
36 | - return 2; | ||
37 | + return -1; | ||
38 | } | ||
39 | } | ||
40 | - return 2; | ||
41 | + return -1; | ||
42 | } | ||
43 | |||
44 | -/* Return 2 if the condition can't be simplified, and the result | ||
45 | - of the condition (0 or 1) if it can */ | ||
46 | -static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c) | ||
47 | +/* | ||
48 | + * Return -1 if the condition can't be simplified, | ||
49 | + * and the result of the condition (0 or 1) if it can. | ||
50 | + */ | ||
51 | +static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c) | ||
52 | { | ||
53 | TCGArg al = p1[0], ah = p1[1]; | ||
54 | TCGArg bl = p2[0], bh = p2[1]; | ||
55 | @@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c) | ||
56 | if (args_are_copies(al, bl) && args_are_copies(ah, bh)) { | ||
57 | return do_constant_folding_cond_eq(c); | ||
58 | } | ||
59 | - return 2; | ||
60 | + return -1; | ||
61 | } | ||
62 | |||
63 | static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2) | ||
64 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
65 | break; | ||
66 | |||
67 | CASE_OP_32_64(setcond): | ||
68 | - tmp = do_constant_folding_cond(opc, op->args[1], | ||
69 | - op->args[2], op->args[3]); | ||
70 | - if (tmp != 2) { | ||
71 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
72 | + i = do_constant_folding_cond(opc, op->args[1], | ||
73 | + op->args[2], op->args[3]); | ||
74 | + if (i >= 0) { | ||
75 | + tcg_opt_gen_movi(&ctx, op, op->args[0], i); | ||
76 | continue; | ||
77 | } | ||
78 | break; | ||
79 | |||
80 | CASE_OP_32_64(brcond): | ||
81 | - tmp = do_constant_folding_cond(opc, op->args[0], | ||
82 | - op->args[1], op->args[2]); | ||
83 | - switch (tmp) { | ||
84 | - case 0: | ||
85 | + i = do_constant_folding_cond(opc, op->args[0], | ||
86 | + op->args[1], op->args[2]); | ||
87 | + if (i == 0) { | ||
88 | tcg_op_remove(s, op); | ||
89 | continue; | ||
90 | - case 1: | ||
91 | + } else if (i > 0) { | ||
92 | memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
93 | op->opc = opc = INDEX_op_br; | ||
94 | op->args[0] = op->args[3]; | ||
95 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
96 | break; | ||
97 | |||
98 | CASE_OP_32_64(movcond): | ||
99 | - tmp = do_constant_folding_cond(opc, op->args[1], | ||
100 | - op->args[2], op->args[5]); | ||
101 | - if (tmp != 2) { | ||
102 | - tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]); | ||
103 | + i = do_constant_folding_cond(opc, op->args[1], | ||
104 | + op->args[2], op->args[5]); | ||
105 | + if (i >= 0) { | ||
106 | + tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]); | ||
107 | continue; | ||
108 | } | ||
109 | if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { | ||
110 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
111 | break; | ||
112 | |||
113 | case INDEX_op_brcond2_i32: | ||
114 | - tmp = do_constant_folding_cond2(&op->args[0], &op->args[2], | ||
115 | - op->args[4]); | ||
116 | - if (tmp == 0) { | ||
117 | + i = do_constant_folding_cond2(&op->args[0], &op->args[2], | ||
118 | + op->args[4]); | ||
119 | + if (i == 0) { | ||
120 | do_brcond_false: | ||
121 | tcg_op_remove(s, op); | ||
122 | continue; | ||
123 | } | ||
124 | - if (tmp == 1) { | ||
125 | + if (i > 0) { | ||
126 | do_brcond_true: | ||
127 | op->opc = opc = INDEX_op_br; | ||
128 | op->args[0] = op->args[5]; | ||
129 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
130 | if (op->args[4] == TCG_COND_EQ) { | ||
131 | /* Simplify EQ comparisons where one of the pairs | ||
132 | can be simplified. */ | ||
133 | - tmp = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
134 | - op->args[0], op->args[2], | ||
135 | - TCG_COND_EQ); | ||
136 | - if (tmp == 0) { | ||
137 | + i = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
138 | + op->args[0], op->args[2], | ||
139 | + TCG_COND_EQ); | ||
140 | + if (i == 0) { | ||
141 | goto do_brcond_false; | ||
142 | - } else if (tmp == 1) { | ||
143 | + } else if (i > 0) { | ||
144 | goto do_brcond_high; | ||
145 | } | ||
146 | - tmp = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
147 | - op->args[1], op->args[3], | ||
148 | - TCG_COND_EQ); | ||
149 | - if (tmp == 0) { | ||
150 | + i = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
151 | + op->args[1], op->args[3], | ||
152 | + TCG_COND_EQ); | ||
153 | + if (i == 0) { | ||
154 | goto do_brcond_false; | ||
155 | - } else if (tmp != 1) { | ||
156 | + } else if (i < 0) { | ||
157 | break; | ||
158 | } | ||
159 | do_brcond_low: | ||
160 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
161 | if (op->args[4] == TCG_COND_NE) { | ||
162 | /* Simplify NE comparisons where one of the pairs | ||
163 | can be simplified. */ | ||
164 | - tmp = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
165 | - op->args[0], op->args[2], | ||
166 | - TCG_COND_NE); | ||
167 | - if (tmp == 0) { | ||
168 | + i = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
169 | + op->args[0], op->args[2], | ||
170 | + TCG_COND_NE); | ||
171 | + if (i == 0) { | ||
172 | goto do_brcond_high; | ||
173 | - } else if (tmp == 1) { | ||
174 | + } else if (i > 0) { | ||
175 | goto do_brcond_true; | ||
176 | } | ||
177 | - tmp = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
178 | - op->args[1], op->args[3], | ||
179 | - TCG_COND_NE); | ||
180 | - if (tmp == 0) { | ||
181 | + i = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
182 | + op->args[1], op->args[3], | ||
183 | + TCG_COND_NE); | ||
184 | + if (i == 0) { | ||
185 | goto do_brcond_low; | ||
186 | - } else if (tmp == 1) { | ||
187 | + } else if (i > 0) { | ||
188 | goto do_brcond_true; | ||
189 | } | ||
190 | } | ||
191 | break; | ||
192 | |||
193 | case INDEX_op_setcond2_i32: | ||
194 | - tmp = do_constant_folding_cond2(&op->args[1], &op->args[3], | ||
195 | - op->args[5]); | ||
196 | - if (tmp != 2) { | ||
197 | + i = do_constant_folding_cond2(&op->args[1], &op->args[3], | ||
198 | + op->args[5]); | ||
199 | + if (i >= 0) { | ||
200 | do_setcond_const: | ||
201 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
202 | + tcg_opt_gen_movi(&ctx, op, op->args[0], i); | ||
203 | continue; | ||
204 | } | ||
205 | if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE) | ||
206 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
207 | if (op->args[5] == TCG_COND_EQ) { | ||
208 | /* Simplify EQ comparisons where one of the pairs | ||
209 | can be simplified. */ | ||
210 | - tmp = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
211 | - op->args[1], op->args[3], | ||
212 | - TCG_COND_EQ); | ||
213 | - if (tmp == 0) { | ||
214 | + i = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
215 | + op->args[1], op->args[3], | ||
216 | + TCG_COND_EQ); | ||
217 | + if (i == 0) { | ||
218 | goto do_setcond_const; | ||
219 | - } else if (tmp == 1) { | ||
220 | + } else if (i > 0) { | ||
221 | goto do_setcond_high; | ||
222 | } | ||
223 | - tmp = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
224 | - op->args[2], op->args[4], | ||
225 | - TCG_COND_EQ); | ||
226 | - if (tmp == 0) { | ||
227 | + i = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
228 | + op->args[2], op->args[4], | ||
229 | + TCG_COND_EQ); | ||
230 | + if (i == 0) { | ||
231 | goto do_setcond_high; | ||
232 | - } else if (tmp != 1) { | ||
233 | + } else if (i < 0) { | ||
234 | break; | ||
235 | } | ||
236 | do_setcond_low: | ||
237 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
238 | if (op->args[5] == TCG_COND_NE) { | ||
239 | /* Simplify NE comparisons where one of the pairs | ||
240 | can be simplified. */ | ||
241 | - tmp = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
242 | - op->args[1], op->args[3], | ||
243 | - TCG_COND_NE); | ||
244 | - if (tmp == 0) { | ||
245 | + i = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
246 | + op->args[1], op->args[3], | ||
247 | + TCG_COND_NE); | ||
248 | + if (i == 0) { | ||
249 | goto do_setcond_high; | ||
250 | - } else if (tmp == 1) { | ||
251 | + } else if (i > 0) { | ||
252 | goto do_setcond_const; | ||
253 | } | ||
254 | - tmp = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
255 | - op->args[2], op->args[4], | ||
256 | - TCG_COND_NE); | ||
257 | - if (tmp == 0) { | ||
258 | + i = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
259 | + op->args[2], op->args[4], | ||
260 | + TCG_COND_NE); | ||
261 | + if (i == 0) { | ||
262 | goto do_setcond_low; | ||
263 | - } else if (tmp == 1) { | ||
264 | + } else if (i > 0) { | ||
265 | goto do_setcond_const; | ||
266 | } | ||
267 | } | ||
268 | -- | ||
269 | 2.25.1 | ||
270 | |||
New patch | |||
---|---|---|---|
1 | This will allow callers to tail call these functions | ||
2 | and return true to indicate that processing is complete. | ||
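The enabled idiom, sketched with a hypothetical fold function:

    static bool fold_example(OptContext *ctx, TCGOp *op)
    {
        if (arg_is_const(op->args[1])) {
            /* fully handled: forward the "done" result */
            return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
        }
        return false;   /* leave the op for generic processing */
    }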
1 | 3 | ||
4 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/optimize.c | 9 +++++---- | ||
10 | 1 file changed, 5 insertions(+), 4 deletions(-) | ||
11 | |||
12 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/optimize.c | ||
15 | +++ b/tcg/optimize.c | ||
16 | @@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2) | ||
17 | return ts_are_copies(arg_temp(arg1), arg_temp(arg2)); | ||
18 | } | ||
19 | |||
20 | -static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
21 | +static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
22 | { | ||
23 | TCGTemp *dst_ts = arg_temp(dst); | ||
24 | TCGTemp *src_ts = arg_temp(src); | ||
25 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
26 | |||
27 | if (ts_are_copies(dst_ts, src_ts)) { | ||
28 | tcg_op_remove(ctx->tcg, op); | ||
29 | - return; | ||
30 | + return true; | ||
31 | } | ||
32 | |||
33 | reset_ts(dst_ts); | ||
34 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
35 | di->is_const = si->is_const; | ||
36 | di->val = si->val; | ||
37 | } | ||
38 | + return true; | ||
39 | } | ||
40 | |||
41 | -static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op, | ||
42 | +static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op, | ||
43 | TCGArg dst, uint64_t val) | ||
44 | { | ||
45 | const TCGOpDef *def = &tcg_op_defs[op->opc]; | ||
46 | @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op, | ||
47 | /* Convert movi to mov with constant temp. */ | ||
48 | tv = tcg_constant_internal(type, val); | ||
49 | init_ts_info(ctx, tv); | ||
50 | - tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv)); | ||
51 | + return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv)); | ||
52 | } | ||
53 | |||
54 | static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y) | ||
55 | -- | ||
56 | 2.25.1 | ||
57 | |||
New patch | |||
---|---|---|---|
1 | Copy z_mask into OptContext, for writeback to the | ||
2 | first output within the new function. | ||
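For context, z_mask records which result bits are known to be zero;
the flow is (simplified):

    ctx.z_mask = 0xff;      /* e.g. after ext8u: top bits known zero */
    ...
    /* finish_folding() then writes it back to the first output: */
    arg_info(op->args[0])->z_mask = ctx->z_mask;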
1 | 3 | ||
4 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++---------------- | ||
9 | 1 file changed, 33 insertions(+), 16 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ typedef struct OptContext { | ||
16 | TCGContext *tcg; | ||
17 | TCGOp *prev_mb; | ||
18 | TCGTempSet temps_used; | ||
19 | + | ||
20 | + /* In flight values from optimization. */ | ||
21 | + uint64_t z_mask; | ||
22 | } OptContext; | ||
23 | |||
24 | static inline TempOptInfo *ts_info(TCGTemp *ts) | ||
25 | @@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op, | ||
26 | } | ||
27 | } | ||
28 | |||
29 | +static void finish_folding(OptContext *ctx, TCGOp *op) | ||
30 | +{ | ||
31 | + const TCGOpDef *def = &tcg_op_defs[op->opc]; | ||
32 | + int i, nb_oargs; | ||
33 | + | ||
34 | + /* | ||
35 | + * For an opcode that ends a BB, reset all temp data. | ||
36 | + * We do no cross-BB optimization. | ||
37 | + */ | ||
38 | + if (def->flags & TCG_OPF_BB_END) { | ||
39 | + memset(&ctx->temps_used, 0, sizeof(ctx->temps_used)); | ||
40 | + ctx->prev_mb = NULL; | ||
41 | + return; | ||
42 | + } | ||
43 | + | ||
44 | + nb_oargs = def->nb_oargs; | ||
45 | + for (i = 0; i < nb_oargs; i++) { | ||
46 | + reset_temp(op->args[i]); | ||
47 | + /* | ||
48 | + * Save the corresponding known-zero bits mask for the | ||
49 | + * first output argument (only one supported so far). | ||
50 | + */ | ||
51 | + if (i == 0) { | ||
52 | + arg_info(op->args[i])->z_mask = ctx->z_mask; | ||
53 | + } | ||
54 | + } | ||
55 | +} | ||
56 | + | ||
57 | static bool fold_call(OptContext *ctx, TCGOp *op) | ||
58 | { | ||
59 | TCGContext *s = ctx->tcg; | ||
60 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
61 | partmask &= 0xffffffffu; | ||
62 | affected &= 0xffffffffu; | ||
63 | } | ||
64 | + ctx.z_mask = z_mask; | ||
65 | |||
66 | if (partmask == 0) { | ||
67 | tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
68 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
69 | break; | ||
70 | } | ||
71 | |||
72 | - /* Some of the folding above can change opc. */ | ||
73 | - opc = op->opc; | ||
74 | - def = &tcg_op_defs[opc]; | ||
75 | - if (def->flags & TCG_OPF_BB_END) { | ||
76 | - memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
77 | - } else { | ||
78 | - int nb_oargs = def->nb_oargs; | ||
79 | - for (i = 0; i < nb_oargs; i++) { | ||
80 | - reset_temp(op->args[i]); | ||
81 | - /* Save the corresponding known-zero bits mask for the | ||
82 | - first output argument (only one supported so far). */ | ||
83 | - if (i == 0) { | ||
84 | - arg_info(op->args[i])->z_mask = z_mask; | ||
85 | - } | ||
86 | - } | ||
87 | - } | ||
88 | + finish_folding(&ctx, op); | ||
89 | |||
90 | /* Eliminate duplicate and redundant fence instructions. */ | ||
91 | if (ctx.prev_mb) { | ||
92 | -- | ||
93 | 2.25.1 | ||
94 | |||
New patch | |||
---|---|---|---|
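Use a "done" flag for the result of the fold helpers, so that
finish_folding is only run for opcodes that were not already
fully handled.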
1 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
2 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
3 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tcg/optimize.c | 9 ++++++--- | ||
7 | 1 file changed, 6 insertions(+), 3 deletions(-) | ||
1 | 8 | ||
9 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/tcg/optimize.c | ||
12 | +++ b/tcg/optimize.c | ||
13 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
14 | uint64_t z_mask, partmask, affected, tmp; | ||
15 | TCGOpcode opc = op->opc; | ||
16 | const TCGOpDef *def; | ||
17 | + bool done = false; | ||
18 | |||
19 | /* Calls are special. */ | ||
20 | if (opc == INDEX_op_call) { | ||
21 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
22 | allocator where needed and possible. Also detect copies. */ | ||
23 | switch (opc) { | ||
24 | CASE_OP_32_64_VEC(mov): | ||
25 | - tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
26 | - continue; | ||
27 | + done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
28 | + break; | ||
29 | |||
30 | case INDEX_op_dup_vec: | ||
31 | if (arg_is_const(op->args[1])) { | ||
32 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
33 | break; | ||
34 | } | ||
35 | |||
36 | - finish_folding(&ctx, op); | ||
37 | + if (!done) { | ||
38 | + finish_folding(&ctx, op); | ||
39 | + } | ||
40 | |||
41 | /* Eliminate duplicate and redundant fence instructions. */ | ||
42 | if (ctx.prev_mb) { | ||
43 | -- | ||
44 | 2.25.1 | ||
45 | |||
New patch | |||
---|---|---|---|
1 | This puts the separate mb optimization into the same framework | ||
2 | as the others. While fold_qemu_{ld,st} are currently identical, | ||
3 | that won't last as more code gets moved. | ||
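The barrier merge itself is unchanged; in IR terms (sketch):

    /* mb <X>                      two adjacent barriers collapse
     * mb <Y>     =>   mb <X|Y>    into one, keeping the combined
     *                             flags at the first position. */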
1 | 4 | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/optimize.c | 89 +++++++++++++++++++++++++++++--------------------- | ||
10 | 1 file changed, 51 insertions(+), 38 deletions(-) | ||
11 | |||
12 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/optimize.c | ||
15 | +++ b/tcg/optimize.c | ||
16 | @@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op) | ||
17 | return true; | ||
18 | } | ||
19 | |||
20 | +static bool fold_mb(OptContext *ctx, TCGOp *op) | ||
21 | +{ | ||
22 | + /* Eliminate duplicate and redundant fence instructions. */ | ||
23 | + if (ctx->prev_mb) { | ||
24 | + /* | ||
25 | + * Merge two barriers of the same type into one, | ||
26 | + * or a weaker barrier into a stronger one, | ||
27 | + * or two weaker barriers into a stronger one. | ||
28 | + * mb X; mb Y => mb X|Y | ||
29 | + * mb; strl => mb; st | ||
30 | + * ldaq; mb => ld; mb | ||
31 | + * ldaq; strl => ld; mb; st | ||
32 | + * Other combinations are also merged into a strong | ||
33 | + * barrier. This is stricter than specified but for | ||
34 | + * the purposes of TCG is better than not optimizing. | ||
35 | + */ | ||
36 | + ctx->prev_mb->args[0] |= op->args[0]; | ||
37 | + tcg_op_remove(ctx->tcg, op); | ||
38 | + } else { | ||
39 | + ctx->prev_mb = op; | ||
40 | + } | ||
41 | + return true; | ||
42 | +} | ||
43 | + | ||
44 | +static bool fold_qemu_ld(OptContext *ctx, TCGOp *op) | ||
45 | +{ | ||
46 | + /* Opcodes that touch guest memory stop the mb optimization. */ | ||
47 | + ctx->prev_mb = NULL; | ||
48 | + return false; | ||
49 | +} | ||
50 | + | ||
51 | +static bool fold_qemu_st(OptContext *ctx, TCGOp *op) | ||
52 | +{ | ||
53 | + /* Opcodes that touch guest memory stop the mb optimization. */ | ||
54 | + ctx->prev_mb = NULL; | ||
55 | + return false; | ||
56 | +} | ||
57 | + | ||
58 | /* Propagate constants and copies, fold constant expressions. */ | ||
59 | void tcg_optimize(TCGContext *s) | ||
60 | { | ||
61 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
62 | } | ||
63 | break; | ||
64 | |||
65 | + case INDEX_op_mb: | ||
66 | + done = fold_mb(&ctx, op); | ||
67 | + break; | ||
68 | + case INDEX_op_qemu_ld_i32: | ||
69 | + case INDEX_op_qemu_ld_i64: | ||
70 | + done = fold_qemu_ld(&ctx, op); | ||
71 | + break; | ||
72 | + case INDEX_op_qemu_st_i32: | ||
73 | + case INDEX_op_qemu_st8_i32: | ||
74 | + case INDEX_op_qemu_st_i64: | ||
75 | + done = fold_qemu_st(&ctx, op); | ||
76 | + break; | ||
77 | + | ||
78 | default: | ||
79 | break; | ||
80 | } | ||
81 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
82 | if (!done) { | ||
83 | finish_folding(&ctx, op); | ||
84 | } | ||
85 | - | ||
86 | - /* Eliminate duplicate and redundant fence instructions. */ | ||
87 | - if (ctx.prev_mb) { | ||
88 | - switch (opc) { | ||
89 | - case INDEX_op_mb: | ||
90 | - /* Merge two barriers of the same type into one, | ||
91 | - * or a weaker barrier into a stronger one, | ||
92 | - * or two weaker barriers into a stronger one. | ||
93 | - * mb X; mb Y => mb X|Y | ||
94 | - * mb; strl => mb; st | ||
95 | - * ldaq; mb => ld; mb | ||
96 | - * ldaq; strl => ld; mb; st | ||
97 | - * Other combinations are also merged into a strong | ||
98 | - * barrier. This is stricter than specified but for | ||
99 | - * the purposes of TCG is better than not optimizing. | ||
100 | - */ | ||
101 | - ctx.prev_mb->args[0] |= op->args[0]; | ||
102 | - tcg_op_remove(s, op); | ||
103 | - break; | ||
104 | - | ||
105 | - default: | ||
106 | - /* Opcodes that end the block stop the optimization. */ | ||
107 | - if ((def->flags & TCG_OPF_BB_END) == 0) { | ||
108 | - break; | ||
109 | - } | ||
110 | - /* fallthru */ | ||
111 | - case INDEX_op_qemu_ld_i32: | ||
112 | - case INDEX_op_qemu_ld_i64: | ||
113 | - case INDEX_op_qemu_st_i32: | ||
114 | - case INDEX_op_qemu_st8_i32: | ||
115 | - case INDEX_op_qemu_st_i64: | ||
116 | - /* Opcodes that touch guest memory stop the optimization. */ | ||
117 | - ctx.prev_mb = NULL; | ||
118 | - break; | ||
119 | - } | ||
120 | - } else if (opc == INDEX_op_mb) { | ||
121 | - ctx.prev_mb = op; | ||
122 | - } | ||
123 | } | ||
124 | } | ||
125 | -- | ||
126 | 2.25.1 | ||
127 | |||
1 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 1 | Split out a whole bunch of placeholder functions, which are |
---|---|---|---|
2 | currently identical. That won't last as more code gets moved. | ||
3 | |||
4 | Use CASE_32_64_VEC for some logical operators that previously | ||
5 | missed the addition of vectors. | ||
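For reference, CASE_OP_32_64_VEC covers all three type variants of an
opcode; it expands along these lines:

    #define CASE_OP_32_64_VEC(x)     \
        case INDEX_op_ ## x ## _i32: \
        case INDEX_op_ ## x ## _i64: \
        case INDEX_op_ ## x ## _vec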
6 | |||
7 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
8 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
3 | --- | 10 | --- |
4 | tcg/arm/tcg-target.c.inc | 70 ++++++++++++++++++++++++++++++++++++---- | 11 | tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++---------- |
5 | 1 file changed, 64 insertions(+), 6 deletions(-) | 12 | 1 file changed, 219 insertions(+), 52 deletions(-) |
6 | 13 | ||
7 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 14 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
8 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
9 | --- a/tcg/arm/tcg-target.c.inc | 16 | --- a/tcg/optimize.c |
10 | +++ b/tcg/arm/tcg-target.c.inc | 17 | +++ b/tcg/optimize.c |
11 | @@ -XXX,XX +XXX,XX @@ typedef enum { | 18 | @@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op) |
12 | INSN_NOP_v6k = 0xe320f000, | ||
13 | /* Otherwise the assembler uses mov r0,r0 */ | ||
14 | INSN_NOP_v4 = (COND_AL << 28) | ARITH_MOV, | ||
15 | + | ||
16 | + INSN_VLD1 = 0xf4200000, /* VLD1 (multiple single elements) */ | ||
17 | + INSN_VST1 = 0xf4000000, /* VST1 (multiple single elements) */ | ||
18 | } ARMInsn; | ||
19 | |||
20 | #define INSN_NOP (use_armv7_instructions ? INSN_NOP_v6k : INSN_NOP_v4) | ||
21 | @@ -XXX,XX +XXX,XX @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg *args, | ||
22 | } | 19 | } |
23 | } | 20 | } |
24 | 21 | ||
25 | +/* | 22 | +/* |
26 | + * Note that TCGReg references Q-registers. | 23 | + * The fold_* functions return true when processing is complete, |
27 | + * Q-regno = 2 * D-regno, so shift left by 1 whlie inserting. | 24 | + * usually by folding the operation to a constant or to a copy, |
25 | + * and calling tcg_opt_gen_{mov,movi}. They may do other things, | ||
26 | + * like collect information about the value produced, for use in | ||
27 | + * optimizing a subsequent operation. | ||
28 | + * | ||
29 | + * These first fold_* functions are all helpers, used by other | ||
30 | + * folders for more specific operations. | ||
28 | + */ | 31 | + */ |
29 | +static uint32_t encode_vd(TCGReg rd) | 32 | + |
30 | +{ | 33 | +static bool fold_const1(OptContext *ctx, TCGOp *op) |
31 | + tcg_debug_assert(rd >= TCG_REG_Q0); | 34 | +{ |
32 | + return (extract32(rd, 3, 1) << 22) | (extract32(rd, 0, 3) << 13); | 35 | + if (arg_is_const(op->args[1])) { |
33 | +} | 36 | + uint64_t t; |
34 | + | 37 | + |
35 | +static void tcg_out_vldst(TCGContext *s, ARMInsn insn, | 38 | + t = arg_info(op->args[1])->val; |
36 | + TCGReg rd, TCGReg rn, int offset) | 39 | + t = do_constant_folding(op->opc, t, 0); |
37 | +{ | 40 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t); |
38 | + if (offset != 0) { | ||
39 | + if (check_fit_imm(offset) || check_fit_imm(-offset)) { | ||
40 | + tcg_out_dat_rIN(s, COND_AL, ARITH_ADD, ARITH_SUB, | ||
41 | + TCG_REG_TMP, rn, offset, true); | ||
42 | + } else { | ||
43 | + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP, offset); | ||
44 | + tcg_out_dat_reg(s, COND_AL, ARITH_ADD, | ||
45 | + TCG_REG_TMP, TCG_REG_TMP, rn, 0); | ||
46 | + } | ||
47 | + rn = TCG_REG_TMP; | ||
48 | + } | 41 | + } |
49 | + tcg_out32(s, insn | (rn << 16) | encode_vd(rd) | 0xf); | 42 | + return false; |
50 | +} | 43 | +} |
51 | + | 44 | + |
52 | #ifdef CONFIG_SOFTMMU | 45 | +static bool fold_const2(OptContext *ctx, TCGOp *op) |
53 | #include "../tcg-ldst.c.inc" | 46 | +{ |
54 | 47 | + if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | |
55 | @@ -XXX,XX +XXX,XX @@ static void tcg_target_init(TCGContext *s) | 48 | + uint64_t t1 = arg_info(op->args[1])->val; |
56 | tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP); | 49 | + uint64_t t2 = arg_info(op->args[2])->val; |
50 | + | ||
51 | + t1 = do_constant_folding(op->opc, t1, t2); | ||
52 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t1); | ||
53 | + } | ||
54 | + return false; | ||
55 | +} | ||
56 | + | ||
57 | +/* | ||
58 | + * These outermost fold_<op> functions are sorted alphabetically. | ||
59 | + */ | ||
60 | + | ||
61 | +static bool fold_add(OptContext *ctx, TCGOp *op) | ||
62 | +{ | ||
63 | + return fold_const2(ctx, op); | ||
64 | +} | ||
65 | + | ||
66 | +static bool fold_and(OptContext *ctx, TCGOp *op) | ||
67 | +{ | ||
68 | + return fold_const2(ctx, op); | ||
69 | +} | ||
70 | + | ||
71 | +static bool fold_andc(OptContext *ctx, TCGOp *op) | ||
72 | +{ | ||
73 | + return fold_const2(ctx, op); | ||
74 | +} | ||
75 | + | ||
76 | static bool fold_call(OptContext *ctx, TCGOp *op) | ||
77 | { | ||
78 | TCGContext *s = ctx->tcg; | ||
79 | @@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op) | ||
80 | return true; | ||
57 | } | 81 | } |
58 | 82 | ||
59 | -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg, | 83 | +static bool fold_ctpop(OptContext *ctx, TCGOp *op) |
60 | - TCGReg arg1, intptr_t arg2) | 84 | +{ |
61 | +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg, | 85 | + return fold_const1(ctx, op); |
62 | + TCGReg arg1, intptr_t arg2) | 86 | +} |
87 | + | ||
88 | +static bool fold_divide(OptContext *ctx, TCGOp *op) | ||
89 | +{ | ||
90 | + return fold_const2(ctx, op); | ||
91 | +} | ||
92 | + | ||
93 | +static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
94 | +{ | ||
95 | + return fold_const2(ctx, op); | ||
96 | +} | ||
97 | + | ||
98 | +static bool fold_exts(OptContext *ctx, TCGOp *op) | ||
99 | +{ | ||
100 | + return fold_const1(ctx, op); | ||
101 | +} | ||
102 | + | ||
103 | +static bool fold_extu(OptContext *ctx, TCGOp *op) | ||
104 | +{ | ||
105 | + return fold_const1(ctx, op); | ||
106 | +} | ||
107 | + | ||
108 | static bool fold_mb(OptContext *ctx, TCGOp *op) | ||
63 | { | 109 | { |
64 | - tcg_out_ld32u(s, COND_AL, arg, arg1, arg2); | 110 | /* Eliminate duplicate and redundant fence instructions. */ |
65 | + switch (type) { | 111 | @@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op) |
66 | + case TCG_TYPE_I32: | 112 | return true; |
67 | + tcg_out_ld32u(s, COND_AL, arg, arg1, arg2); | ||
68 | + return; | ||
69 | + case TCG_TYPE_V64: | ||
70 | + /* regs 1; size 8; align 8 */ | ||
71 | + tcg_out_vldst(s, INSN_VLD1 | 0x7d0, arg, arg1, arg2); | ||
72 | + return; | ||
73 | + case TCG_TYPE_V128: | ||
74 | + /* regs 2; size 8; align 16 */ | ||
75 | + tcg_out_vldst(s, INSN_VLD1 | 0xae0, arg, arg1, arg2); | ||
76 | + return; | ||
77 | + default: | ||
78 | + g_assert_not_reached(); | ||
79 | + } | ||
80 | } | 113 | } |
81 | 114 | ||
82 | -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, | 115 | +static bool fold_mul(OptContext *ctx, TCGOp *op) |
83 | - TCGReg arg1, intptr_t arg2) | 116 | +{ |
84 | +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, | 117 | + return fold_const2(ctx, op); |
85 | + TCGReg arg1, intptr_t arg2) | 118 | +} |
119 | + | ||
120 | +static bool fold_mul_highpart(OptContext *ctx, TCGOp *op) | ||
121 | +{ | ||
122 | + return fold_const2(ctx, op); | ||
123 | +} | ||
124 | + | ||
125 | +static bool fold_nand(OptContext *ctx, TCGOp *op) | ||
126 | +{ | ||
127 | + return fold_const2(ctx, op); | ||
128 | +} | ||
129 | + | ||
130 | +static bool fold_neg(OptContext *ctx, TCGOp *op) | ||
131 | +{ | ||
132 | + return fold_const1(ctx, op); | ||
133 | +} | ||
134 | + | ||
135 | +static bool fold_nor(OptContext *ctx, TCGOp *op) | ||
136 | +{ | ||
137 | + return fold_const2(ctx, op); | ||
138 | +} | ||
139 | + | ||
140 | +static bool fold_not(OptContext *ctx, TCGOp *op) | ||
141 | +{ | ||
142 | + return fold_const1(ctx, op); | ||
143 | +} | ||
144 | + | ||
145 | +static bool fold_or(OptContext *ctx, TCGOp *op) | ||
146 | +{ | ||
147 | + return fold_const2(ctx, op); | ||
148 | +} | ||
149 | + | ||
150 | +static bool fold_orc(OptContext *ctx, TCGOp *op) | ||
151 | +{ | ||
152 | + return fold_const2(ctx, op); | ||
153 | +} | ||
154 | + | ||
155 | static bool fold_qemu_ld(OptContext *ctx, TCGOp *op) | ||
86 | { | 156 | { |
87 | - tcg_out_st32(s, COND_AL, arg, arg1, arg2); | 157 | /* Opcodes that touch guest memory stop the mb optimization. */ |
88 | + switch (type) { | 158 | @@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op) |
89 | + case TCG_TYPE_I32: | 159 | return false; |
90 | + tcg_out_st32(s, COND_AL, arg, arg1, arg2); | ||
91 | + return; | ||
92 | + case TCG_TYPE_V64: | ||
93 | + /* regs 1; size 8; align 8 */ | ||
94 | + tcg_out_vldst(s, INSN_VST1 | 0x7d0, arg, arg1, arg2); | ||
95 | + return; | ||
96 | + case TCG_TYPE_V128: | ||
97 | + /* regs 2; size 8; align 16 */ | ||
98 | + tcg_out_vldst(s, INSN_VST1 | 0xae0, arg, arg1, arg2); | ||
99 | + return; | ||
100 | + default: | ||
101 | + g_assert_not_reached(); | ||
102 | + } | ||
103 | } | 160 | } |
104 | 161 | ||
105 | static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, | 162 | +static bool fold_remainder(OptContext *ctx, TCGOp *op) |
163 | +{ | ||
164 | + return fold_const2(ctx, op); | ||
165 | +} | ||
166 | + | ||
167 | +static bool fold_shift(OptContext *ctx, TCGOp *op) | ||
168 | +{ | ||
169 | + return fold_const2(ctx, op); | ||
170 | +} | ||
171 | + | ||
172 | +static bool fold_sub(OptContext *ctx, TCGOp *op) | ||
173 | +{ | ||
174 | + return fold_const2(ctx, op); | ||
175 | +} | ||
176 | + | ||
177 | +static bool fold_xor(OptContext *ctx, TCGOp *op) | ||
178 | +{ | ||
179 | + return fold_const2(ctx, op); | ||
180 | +} | ||
181 | + | ||
182 | /* Propagate constants and copies, fold constant expressions. */ | ||
183 | void tcg_optimize(TCGContext *s) | ||
184 | { | ||
185 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
186 | } | ||
187 | break; | ||
188 | |||
189 | - CASE_OP_32_64(not): | ||
190 | - CASE_OP_32_64(neg): | ||
191 | - CASE_OP_32_64(ext8s): | ||
192 | - CASE_OP_32_64(ext8u): | ||
193 | - CASE_OP_32_64(ext16s): | ||
194 | - CASE_OP_32_64(ext16u): | ||
195 | - CASE_OP_32_64(ctpop): | ||
196 | - case INDEX_op_ext32s_i64: | ||
197 | - case INDEX_op_ext32u_i64: | ||
198 | - case INDEX_op_ext_i32_i64: | ||
199 | - case INDEX_op_extu_i32_i64: | ||
200 | - case INDEX_op_extrl_i64_i32: | ||
201 | - case INDEX_op_extrh_i64_i32: | ||
202 | - if (arg_is_const(op->args[1])) { | ||
203 | - tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0); | ||
204 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
205 | - continue; | ||
206 | - } | ||
207 | - break; | ||
208 | - | ||
209 | CASE_OP_32_64(bswap16): | ||
210 | CASE_OP_32_64(bswap32): | ||
211 | case INDEX_op_bswap64_i64: | ||
212 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
213 | } | ||
214 | break; | ||
215 | |||
216 | - CASE_OP_32_64(add): | ||
217 | - CASE_OP_32_64(sub): | ||
218 | - CASE_OP_32_64(mul): | ||
219 | - CASE_OP_32_64(or): | ||
220 | - CASE_OP_32_64(and): | ||
221 | - CASE_OP_32_64(xor): | ||
222 | - CASE_OP_32_64(shl): | ||
223 | - CASE_OP_32_64(shr): | ||
224 | - CASE_OP_32_64(sar): | ||
225 | - CASE_OP_32_64(rotl): | ||
226 | - CASE_OP_32_64(rotr): | ||
227 | - CASE_OP_32_64(andc): | ||
228 | - CASE_OP_32_64(orc): | ||
229 | - CASE_OP_32_64(eqv): | ||
230 | - CASE_OP_32_64(nand): | ||
231 | - CASE_OP_32_64(nor): | ||
232 | - CASE_OP_32_64(muluh): | ||
233 | - CASE_OP_32_64(mulsh): | ||
234 | - CASE_OP_32_64(div): | ||
235 | - CASE_OP_32_64(divu): | ||
236 | - CASE_OP_32_64(rem): | ||
237 | - CASE_OP_32_64(remu): | ||
238 | - if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
239 | - tmp = do_constant_folding(opc, arg_info(op->args[1])->val, | ||
240 | - arg_info(op->args[2])->val); | ||
241 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
242 | - continue; | ||
243 | - } | ||
244 | - break; | ||
245 | - | ||
246 | CASE_OP_32_64(clz): | ||
247 | CASE_OP_32_64(ctz): | ||
248 | if (arg_is_const(op->args[1])) { | ||
249 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
250 | } | ||
251 | break; | ||
252 | |||
253 | + default: | ||
254 | + break; | ||
255 | + | ||
256 | + /* ---------------------------------------------------------- */ | ||
257 | + /* Sorted alphabetically by opcode as much as possible. */ | ||
258 | + | ||
259 | + CASE_OP_32_64_VEC(add): | ||
260 | + done = fold_add(&ctx, op); | ||
261 | + break; | ||
262 | + CASE_OP_32_64_VEC(and): | ||
263 | + done = fold_and(&ctx, op); | ||
264 | + break; | ||
265 | + CASE_OP_32_64_VEC(andc): | ||
266 | + done = fold_andc(&ctx, op); | ||
267 | + break; | ||
268 | + CASE_OP_32_64(ctpop): | ||
269 | + done = fold_ctpop(&ctx, op); | ||
270 | + break; | ||
271 | + CASE_OP_32_64(div): | ||
272 | + CASE_OP_32_64(divu): | ||
273 | + done = fold_divide(&ctx, op); | ||
274 | + break; | ||
275 | + CASE_OP_32_64(eqv): | ||
276 | + done = fold_eqv(&ctx, op); | ||
277 | + break; | ||
278 | + CASE_OP_32_64(ext8s): | ||
279 | + CASE_OP_32_64(ext16s): | ||
280 | + case INDEX_op_ext32s_i64: | ||
281 | + case INDEX_op_ext_i32_i64: | ||
282 | + done = fold_exts(&ctx, op); | ||
283 | + break; | ||
284 | + CASE_OP_32_64(ext8u): | ||
285 | + CASE_OP_32_64(ext16u): | ||
286 | + case INDEX_op_ext32u_i64: | ||
287 | + case INDEX_op_extu_i32_i64: | ||
288 | + case INDEX_op_extrl_i64_i32: | ||
289 | + case INDEX_op_extrh_i64_i32: | ||
290 | + done = fold_extu(&ctx, op); | ||
291 | + break; | ||
292 | case INDEX_op_mb: | ||
293 | done = fold_mb(&ctx, op); | ||
294 | break; | ||
295 | + CASE_OP_32_64(mul): | ||
296 | + done = fold_mul(&ctx, op); | ||
297 | + break; | ||
298 | + CASE_OP_32_64(mulsh): | ||
299 | + CASE_OP_32_64(muluh): | ||
300 | + done = fold_mul_highpart(&ctx, op); | ||
301 | + break; | ||
302 | + CASE_OP_32_64(nand): | ||
303 | + done = fold_nand(&ctx, op); | ||
304 | + break; | ||
305 | + CASE_OP_32_64(neg): | ||
306 | + done = fold_neg(&ctx, op); | ||
307 | + break; | ||
308 | + CASE_OP_32_64(nor): | ||
309 | + done = fold_nor(&ctx, op); | ||
310 | + break; | ||
311 | + CASE_OP_32_64_VEC(not): | ||
312 | + done = fold_not(&ctx, op); | ||
313 | + break; | ||
314 | + CASE_OP_32_64_VEC(or): | ||
315 | + done = fold_or(&ctx, op); | ||
316 | + break; | ||
317 | + CASE_OP_32_64_VEC(orc): | ||
318 | + done = fold_orc(&ctx, op); | ||
319 | + break; | ||
320 | case INDEX_op_qemu_ld_i32: | ||
321 | case INDEX_op_qemu_ld_i64: | ||
322 | done = fold_qemu_ld(&ctx, op); | ||
323 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
324 | case INDEX_op_qemu_st_i64: | ||
325 | done = fold_qemu_st(&ctx, op); | ||
326 | break; | ||
327 | - | ||
328 | - default: | ||
329 | + CASE_OP_32_64(rem): | ||
330 | + CASE_OP_32_64(remu): | ||
331 | + done = fold_remainder(&ctx, op); | ||
332 | + break; | ||
333 | + CASE_OP_32_64(rotl): | ||
334 | + CASE_OP_32_64(rotr): | ||
335 | + CASE_OP_32_64(sar): | ||
336 | + CASE_OP_32_64(shl): | ||
337 | + CASE_OP_32_64(shr): | ||
338 | + done = fold_shift(&ctx, op); | ||
339 | + break; | ||
340 | + CASE_OP_32_64_VEC(sub): | ||
341 | + done = fold_sub(&ctx, op); | ||
342 | + break; | ||
343 | + CASE_OP_32_64_VEC(xor): | ||
344 | + done = fold_xor(&ctx, op); | ||
345 | break; | ||
346 | } | ||
347 | |||
106 | -- | 348 | -- |
107 | 2.25.1 | 349 | 2.25.1 |
108 | 350 | ||
1 | Implement via expansion, so don't actually set TCG_TARGET_HAS_rotv_vec. | 1 | Reduce some code duplication by folding the NE and EQ cases. |
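The shared shape, sketched:

    case TCG_COND_NE:
        inv = 1;
        QEMU_FALLTHROUGH;
    case TCG_COND_EQ:
        i = do_constant_folding_cond(INDEX_op_setcond_i32,
                                     op->args[1], op->args[3], cond);
        /* i ^ inv yields both the EQ and the NE answer from the
         * same comparison result. */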
---|---|---|---|
2 | 2 | ||
3 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 3 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> |
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
5 | --- | 6 | --- |
6 | tcg/arm/tcg-target.c.inc | 35 ++++++++++++++++++++++++++++++++++- | 7 | tcg/optimize.c | 145 ++++++++++++++++++++++++------------------------- |
7 | 1 file changed, 34 insertions(+), 1 deletion(-) | 8 | 1 file changed, 72 insertions(+), 73 deletions(-) |
8 | 9 | ||
9 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 10 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
10 | index XXXXXXX..XXXXXXX 100644 | 11 | index XXXXXXX..XXXXXXX 100644 |
11 | --- a/tcg/arm/tcg-target.c.inc | 12 | --- a/tcg/optimize.c |
12 | +++ b/tcg/arm/tcg-target.c.inc | 13 | +++ b/tcg/optimize.c |
13 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | 14 | @@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op) |
14 | case INDEX_op_shrv_vec: | 15 | return fold_const2(ctx, op); |
15 | case INDEX_op_sarv_vec: | 16 | } |
16 | case INDEX_op_rotli_vec: | 17 | |
17 | + case INDEX_op_rotlv_vec: | 18 | +static bool fold_setcond2(OptContext *ctx, TCGOp *op) |
18 | + case INDEX_op_rotrv_vec: | 19 | +{ |
19 | return -1; | 20 | + TCGCond cond = op->args[5]; |
20 | default: | 21 | + int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond); |
21 | return 0; | 22 | + int inv = 0; |
22 | @@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, | 23 | + |
23 | TCGArg a0, ...) | 24 | + if (i >= 0) { |
24 | { | 25 | + goto do_setcond_const; |
25 | va_list va; | 26 | + } |
26 | - TCGv_vec v0, v1, v2, t1; | 27 | + |
27 | + TCGv_vec v0, v1, v2, t1, t2, c1; | 28 | + switch (cond) { |
28 | TCGArg a2; | 29 | + case TCG_COND_LT: |
29 | 30 | + case TCG_COND_GE: | |
30 | va_start(va, a0); | 31 | + /* |
31 | @@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, | 32 | + * Simplify LT/GE comparisons vs zero to a single compare |
32 | tcg_temp_free_vec(t1); | 33 | + * vs the high word of the input. |
33 | break; | 34 | + */ |
34 | 35 | + if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 && | |
35 | + case INDEX_op_rotlv_vec: | 36 | + arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) { |
36 | + v2 = temp_tcgv_vec(arg_temp(a2)); | 37 | + goto do_setcond_high; |
37 | + t1 = tcg_temp_new_vec(type); | 38 | + } |
38 | + c1 = tcg_constant_vec(type, vece, 8 << vece); | ||
39 | + tcg_gen_sub_vec(vece, t1, v2, c1); | ||
40 | + /* Right shifts are negative left shifts for NEON. */ | ||
41 | + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t1), | ||
42 | + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); | ||
43 | + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(v0), | ||
44 | + tcgv_vec_arg(v1), tcgv_vec_arg(v2)); | ||
45 | + tcg_gen_or_vec(vece, v0, v0, t1); | ||
46 | + tcg_temp_free_vec(t1); | ||
47 | + break; | 39 | + break; |
48 | + | 40 | + |
49 | + case INDEX_op_rotrv_vec: | 41 | + case TCG_COND_NE: |
50 | + v2 = temp_tcgv_vec(arg_temp(a2)); | 42 | + inv = 1; |
51 | + t1 = tcg_temp_new_vec(type); | 43 | + QEMU_FALLTHROUGH; |
52 | + t2 = tcg_temp_new_vec(type); | 44 | + case TCG_COND_EQ: |
53 | + c1 = tcg_constant_vec(type, vece, 8 << vece); | 45 | + /* |
54 | + tcg_gen_neg_vec(vece, t1, v2); | 46 | + * Simplify EQ/NE comparisons where one of the pairs |
55 | + tcg_gen_sub_vec(vece, t2, c1, v2); | 47 | + * can be simplified. |
56 | + /* Right shifts are negative left shifts for NEON. */ | 48 | + */ |
57 | + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t1), | 49 | + i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1], |
58 | + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); | 50 | + op->args[3], cond); |
59 | + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(t2), | 51 | + switch (i ^ inv) { |
60 | + tcgv_vec_arg(v1), tcgv_vec_arg(t2)); | 52 | + case 0: |
61 | + tcg_gen_or_vec(vece, v0, t1, t2); | 53 | + goto do_setcond_const; |
62 | + tcg_temp_free_vec(t1); | 54 | + case 1: |
63 | + tcg_temp_free_vec(t2); | 55 | + goto do_setcond_high; |
56 | + } | ||
57 | + | ||
58 | + i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2], | ||
59 | + op->args[4], cond); | ||
60 | + switch (i ^ inv) { | ||
61 | + case 0: | ||
62 | + goto do_setcond_const; | ||
63 | + case 1: | ||
64 | + op->args[2] = op->args[3]; | ||
65 | + op->args[3] = cond; | ||
66 | + op->opc = INDEX_op_setcond_i32; | ||
67 | + break; | ||
68 | + } | ||
64 | + break; | 69 | + break; |
65 | + | 70 | + |
66 | default: | 71 | + default: |
67 | g_assert_not_reached(); | 72 | + break; |
68 | } | 73 | + |
74 | + do_setcond_high: | ||
75 | + op->args[1] = op->args[2]; | ||
76 | + op->args[2] = op->args[4]; | ||
77 | + op->args[3] = cond; | ||
78 | + op->opc = INDEX_op_setcond_i32; | ||
79 | + break; | ||
80 | + } | ||
81 | + return false; | ||
82 | + | ||
83 | + do_setcond_const: | ||
84 | + return tcg_opt_gen_movi(ctx, op, op->args[0], i); | ||
85 | +} | ||
86 | + | ||
87 | static bool fold_shift(OptContext *ctx, TCGOp *op) | ||
88 | { | ||
89 | return fold_const2(ctx, op); | ||
90 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
91 | } | ||
92 | break; | ||
93 | |||
94 | - case INDEX_op_setcond2_i32: | ||
95 | - i = do_constant_folding_cond2(&op->args[1], &op->args[3], | ||
96 | - op->args[5]); | ||
97 | - if (i >= 0) { | ||
98 | - do_setcond_const: | ||
99 | - tcg_opt_gen_movi(&ctx, op, op->args[0], i); | ||
100 | - continue; | ||
101 | - } | ||
102 | - if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE) | ||
103 | - && arg_is_const(op->args[3]) | ||
104 | - && arg_info(op->args[3])->val == 0 | ||
105 | - && arg_is_const(op->args[4]) | ||
106 | - && arg_info(op->args[4])->val == 0) { | ||
107 | - /* Simplify LT/GE comparisons vs zero to a single compare | ||
108 | - vs the high word of the input. */ | ||
109 | - do_setcond_high: | ||
110 | - reset_temp(op->args[0]); | ||
111 | - arg_info(op->args[0])->z_mask = 1; | ||
112 | - op->opc = INDEX_op_setcond_i32; | ||
113 | - op->args[1] = op->args[2]; | ||
114 | - op->args[2] = op->args[4]; | ||
115 | - op->args[3] = op->args[5]; | ||
116 | - break; | ||
117 | - } | ||
118 | - if (op->args[5] == TCG_COND_EQ) { | ||
119 | - /* Simplify EQ comparisons where one of the pairs | ||
120 | - can be simplified. */ | ||
121 | - i = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
122 | - op->args[1], op->args[3], | ||
123 | - TCG_COND_EQ); | ||
124 | - if (i == 0) { | ||
125 | - goto do_setcond_const; | ||
126 | - } else if (i > 0) { | ||
127 | - goto do_setcond_high; | ||
128 | - } | ||
129 | - i = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
130 | - op->args[2], op->args[4], | ||
131 | - TCG_COND_EQ); | ||
132 | - if (i == 0) { | ||
133 | - goto do_setcond_high; | ||
134 | - } else if (i < 0) { | ||
135 | - break; | ||
136 | - } | ||
137 | - do_setcond_low: | ||
138 | - reset_temp(op->args[0]); | ||
139 | - arg_info(op->args[0])->z_mask = 1; | ||
140 | - op->opc = INDEX_op_setcond_i32; | ||
141 | - op->args[2] = op->args[3]; | ||
142 | - op->args[3] = op->args[5]; | ||
143 | - break; | ||
144 | - } | ||
145 | - if (op->args[5] == TCG_COND_NE) { | ||
146 | - /* Simplify NE comparisons where one of the pairs | ||
147 | - can be simplified. */ | ||
148 | - i = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
149 | - op->args[1], op->args[3], | ||
150 | - TCG_COND_NE); | ||
151 | - if (i == 0) { | ||
152 | - goto do_setcond_high; | ||
153 | - } else if (i > 0) { | ||
154 | - goto do_setcond_const; | ||
155 | - } | ||
156 | - i = do_constant_folding_cond(INDEX_op_setcond_i32, | ||
157 | - op->args[2], op->args[4], | ||
158 | - TCG_COND_NE); | ||
159 | - if (i == 0) { | ||
160 | - goto do_setcond_low; | ||
161 | - } else if (i > 0) { | ||
162 | - goto do_setcond_const; | ||
163 | - } | ||
164 | - } | ||
165 | - break; | ||
166 | - | ||
167 | default: | ||
168 | break; | ||
169 | |||
170 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
171 | CASE_OP_32_64(shr): | ||
172 | done = fold_shift(&ctx, op); | ||
173 | break; | ||
174 | + case INDEX_op_setcond2_i32: | ||
175 | + done = fold_setcond2(&ctx, op); | ||
176 | + break; | ||
177 | CASE_OP_32_64_VEC(sub): | ||
178 | done = fold_sub(&ctx, op); | ||
179 | break; | ||
69 | -- | 180 | -- |
70 | 2.25.1 | 181 | 2.25.1 |
71 | 182 | ||
72 | 183 | diff view generated by jsdifflib |
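
The i ^ inv twist in fold_setcond2 above (it reappears in fold_brcond2 in the next patch) is what lets one code path serve both EQ and NE: do_constant_folding_cond returns 1, 0, or -1 for "unknown", and XOR-ing that with inv swaps the two branch targets for NE while leaving the unknown case unmatched. A self-contained sketch of the routing, assuming the same 1/0/-1 convention; eval_cond and the strings are stand-ins, not TCG code:

    #include <stdio.h>

    /* Ternary result, as do_constant_folding_cond returns:
     * 1 true, 0 false, -1 not known at compile time. */
    static int eval_cond(int known, unsigned a, unsigned b, int is_ne)
    {
        if (!known) {
            return -1;
        }
        return is_ne ? a != b : a == b;
    }

    /* Route a double-word EQ/NE given its (possibly unknown) low halves. */
    static const char *fold2(int lo_known, unsigned al, unsigned bl, int is_ne)
    {
        int inv = is_ne;  /* 1 for NE, 0 for EQ */
        int i = eval_cond(lo_known, al, bl, is_ne);

        switch (i ^ inv) {
        case 0:
            return "whole result is a constant";
        case 1:
            return "reduce to comparing the high words";
        }
        return "no folding (low pair unknown)";  /* i was -1 */
    }

    int main(void)
    {
        puts(fold2(1, 5, 5, 0)); /* EQ, lows equal  -> compare high words */
        puts(fold2(1, 5, 6, 1)); /* NE, lows differ -> constant (true) */
        puts(fold2(0, 0, 0, 0)); /* lows unknown    -> no folding */
        return 0;
    }
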
1 | The three vector shift by vector operations are all implemented via | 1 | Reduce some code duplication by folding the NE and EQ cases. |
---|---|---|---|
2 | expansion. Therefore do not actually set TCG_TARGET_HAS_shv_vec, | ||
3 | as none of shlv_vec, shrv_vec, or sarv_vec will actually appear in the | ||
4 | instruction stream; therefore they also do not appear in tcg_target_op_def. | ||
5 | 2 | ||
6 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 3 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> |
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
8 | --- | 5 | --- |
9 | tcg/arm/tcg-target.opc.h | 3 ++ | 6 | tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------ |
10 | tcg/arm/tcg-target.c.inc | 61 +++++++++++++++++++++++++++++++++++++++- | 7 | 1 file changed, 81 insertions(+), 78 deletions(-) |
11 | 2 files changed, 63 insertions(+), 1 deletion(-) | ||
12 | 8 | ||
13 | diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h | 9 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
14 | index XXXXXXX..XXXXXXX 100644 | 10 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/tcg/arm/tcg-target.opc.h | 11 | --- a/tcg/optimize.c |
16 | +++ b/tcg/arm/tcg-target.opc.h | 12 | +++ b/tcg/optimize.c |
17 | @@ -XXX,XX +XXX,XX @@ | 13 | @@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op) |
18 | * emitted by tcg_expand_vec_op. For those familiar with GCC internals, | 14 | return fold_const2(ctx, op); |
19 | * consider these to be UNSPEC with names. | 15 | } |
20 | */ | 16 | |
17 | +static bool fold_brcond2(OptContext *ctx, TCGOp *op) | ||
18 | +{ | ||
19 | + TCGCond cond = op->args[4]; | ||
20 | + int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond); | ||
21 | + TCGArg label = op->args[5]; | ||
22 | + int inv = 0; | ||
21 | + | 23 | + |
22 | +DEF(arm_sshl_vec, 1, 2, 0, IMPLVEC) | 24 | + if (i >= 0) { |
23 | +DEF(arm_ushl_vec, 1, 2, 0, IMPLVEC) | 25 | + goto do_brcond_const; |
24 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 26 | + } |
25 | index XXXXXXX..XXXXXXX 100644 | 27 | + |
26 | --- a/tcg/arm/tcg-target.c.inc | 28 | + switch (cond) { |
27 | +++ b/tcg/arm/tcg-target.c.inc | 29 | + case TCG_COND_LT: |
28 | @@ -XXX,XX +XXX,XX @@ typedef enum { | 30 | + case TCG_COND_GE: |
29 | INSN_VSHLI = 0xf2800510, /* VSHL (immediate) */ | ||
30 | INSN_VSARI = 0xf2800010, /* VSHR.S */ | ||
31 | INSN_VSHRI = 0xf3800010, /* VSHR.U */ | ||
32 | + INSN_VSHL_S = 0xf2000400, /* VSHL.S (register) */ | ||
33 | + INSN_VSHL_U = 0xf3000400, /* VSHL.U (register) */ | ||
34 | |||
35 | INSN_VBSL = 0xf3100110, | ||
36 | INSN_VBIT = 0xf3200110, | ||
37 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | ||
38 | case INDEX_op_usadd_vec: | ||
39 | case INDEX_op_ussub_vec: | ||
40 | case INDEX_op_xor_vec: | ||
41 | + case INDEX_op_arm_sshl_vec: | ||
42 | + case INDEX_op_arm_ushl_vec: | ||
43 | return C_O1_I2(w, w, w); | ||
44 | case INDEX_op_or_vec: | ||
45 | case INDEX_op_andc_vec: | ||
46 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
47 | case INDEX_op_xor_vec: | ||
48 | tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); | ||
49 | return; | ||
50 | + case INDEX_op_arm_sshl_vec: | ||
51 | + /* | 31 | + /* |
52 | + * Note that Vm is the data and Vn is the shift count, | 32 | + * Simplify LT/GE comparisons vs zero to a single compare |
53 | + * therefore the arguments appear reversed. | 33 | + * vs the high word of the input. |
54 | + */ | 34 | + */ |
55 | + tcg_out_vreg3(s, INSN_VSHL_S, q, vece, a0, a2, a1); | 35 | + if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 && |
56 | + return; | 36 | + arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) { |
57 | + case INDEX_op_arm_ushl_vec: | 37 | + goto do_brcond_high; |
58 | + /* See above. */ | 38 | + } |
59 | + tcg_out_vreg3(s, INSN_VSHL_U, q, vece, a0, a2, a1); | ||
60 | + return; | ||
61 | case INDEX_op_shli_vec: | ||
62 | tcg_out_vshifti(s, INSN_VSHLI, q, a0, a1, a2 + (8 << vece)); | ||
63 | return; | ||
64 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | ||
65 | case INDEX_op_umax_vec: | ||
66 | case INDEX_op_umin_vec: | ||
67 | return vece < MO_64; | ||
68 | + case INDEX_op_shlv_vec: | ||
69 | + case INDEX_op_shrv_vec: | ||
70 | + case INDEX_op_sarv_vec: | ||
71 | + return -1; | ||
72 | default: | ||
73 | return 0; | ||
74 | } | ||
75 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | ||
76 | void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, | ||
77 | TCGArg a0, ...) | ||
78 | { | ||
79 | - g_assert_not_reached(); | ||
80 | + va_list va; | ||
81 | + TCGv_vec v0, v1, v2, t1; | ||
82 | + TCGArg a2; | ||
83 | + | ||
84 | + va_start(va, a0); | ||
85 | + v0 = temp_tcgv_vec(arg_temp(a0)); | ||
86 | + v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); | ||
87 | + a2 = va_arg(va, TCGArg); | ||
88 | + va_end(va); | ||
89 | + | ||
90 | + switch (opc) { | ||
91 | + case INDEX_op_shlv_vec: | ||
92 | + /* | ||
93 | + * Merely propagate shlv_vec to arm_ushl_vec. | ||
94 | + * In this way we don't set TCG_TARGET_HAS_shv_vec | ||
95 | + * because everything is done via expansion. | ||
96 | + */ | ||
97 | + v2 = temp_tcgv_vec(arg_temp(a2)); | ||
98 | + vec_gen_3(INDEX_op_arm_ushl_vec, type, vece, tcgv_vec_arg(v0), | ||
99 | + tcgv_vec_arg(v1), tcgv_vec_arg(v2)); | ||
100 | + break; | 39 | + break; |
101 | + | 40 | + |
102 | + case INDEX_op_shrv_vec: | 41 | + case TCG_COND_NE: |
103 | + case INDEX_op_sarv_vec: | 42 | + inv = 1; |
104 | + /* Right shifts are negative left shifts for NEON. */ | 43 | + QEMU_FALLTHROUGH; |
105 | + v2 = temp_tcgv_vec(arg_temp(a2)); | 44 | + case TCG_COND_EQ: |
106 | + t1 = tcg_temp_new_vec(type); | 45 | + /* |
107 | + tcg_gen_neg_vec(vece, t1, v2); | 46 | + * Simplify EQ/NE comparisons where one of the pairs |
108 | + if (opc == INDEX_op_shrv_vec) { | 47 | + * can be simplified. |
109 | + opc = INDEX_op_arm_ushl_vec; | 48 | + */ |
110 | + } else { | 49 | + i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0], |
111 | + opc = INDEX_op_arm_sshl_vec; | 50 | + op->args[2], cond); |
51 | + switch (i ^ inv) { | ||
52 | + case 0: | ||
53 | + goto do_brcond_const; | ||
54 | + case 1: | ||
55 | + goto do_brcond_high; | ||
112 | + } | 56 | + } |
113 | + vec_gen_3(opc, type, vece, tcgv_vec_arg(v0), | 57 | + |
114 | + tcgv_vec_arg(v1), tcgv_vec_arg(t1)); | 58 | + i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1], |
115 | + tcg_temp_free_vec(t1); | 59 | + op->args[3], cond); |
60 | + switch (i ^ inv) { | ||
61 | + case 0: | ||
62 | + goto do_brcond_const; | ||
63 | + case 1: | ||
64 | + op->opc = INDEX_op_brcond_i32; | ||
65 | + op->args[1] = op->args[2]; | ||
66 | + op->args[2] = cond; | ||
67 | + op->args[3] = label; | ||
68 | + break; | ||
69 | + } | ||
116 | + break; | 70 | + break; |
117 | + | 71 | + |
118 | + default: | 72 | + default: |
119 | + g_assert_not_reached(); | 73 | + break; |
74 | + | ||
75 | + do_brcond_high: | ||
76 | + op->opc = INDEX_op_brcond_i32; | ||
77 | + op->args[0] = op->args[1]; | ||
78 | + op->args[1] = op->args[3]; | ||
79 | + op->args[2] = cond; | ||
80 | + op->args[3] = label; | ||
81 | + break; | ||
82 | + | ||
83 | + do_brcond_const: | ||
84 | + if (i == 0) { | ||
85 | + tcg_op_remove(ctx->tcg, op); | ||
86 | + return true; | ||
87 | + } | ||
88 | + op->opc = INDEX_op_br; | ||
89 | + op->args[0] = label; | ||
90 | + break; | ||
120 | + } | 91 | + } |
121 | } | 92 | + return false; |
122 | 93 | +} | |
123 | static void tcg_out_nop_fill(tcg_insn_unit *p, int count) | 94 | + |
95 | static bool fold_call(OptContext *ctx, TCGOp *op) | ||
96 | { | ||
97 | TCGContext *s = ctx->tcg; | ||
98 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
99 | } | ||
100 | break; | ||
101 | |||
102 | - case INDEX_op_brcond2_i32: | ||
103 | - i = do_constant_folding_cond2(&op->args[0], &op->args[2], | ||
104 | - op->args[4]); | ||
105 | - if (i == 0) { | ||
106 | - do_brcond_false: | ||
107 | - tcg_op_remove(s, op); | ||
108 | - continue; | ||
109 | - } | ||
110 | - if (i > 0) { | ||
111 | - do_brcond_true: | ||
112 | - op->opc = opc = INDEX_op_br; | ||
113 | - op->args[0] = op->args[5]; | ||
114 | - break; | ||
115 | - } | ||
116 | - if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE) | ||
117 | - && arg_is_const(op->args[2]) | ||
118 | - && arg_info(op->args[2])->val == 0 | ||
119 | - && arg_is_const(op->args[3]) | ||
120 | - && arg_info(op->args[3])->val == 0) { | ||
121 | - /* Simplify LT/GE comparisons vs zero to a single compare | ||
122 | - vs the high word of the input. */ | ||
123 | - do_brcond_high: | ||
124 | - op->opc = opc = INDEX_op_brcond_i32; | ||
125 | - op->args[0] = op->args[1]; | ||
126 | - op->args[1] = op->args[3]; | ||
127 | - op->args[2] = op->args[4]; | ||
128 | - op->args[3] = op->args[5]; | ||
129 | - break; | ||
130 | - } | ||
131 | - if (op->args[4] == TCG_COND_EQ) { | ||
132 | - /* Simplify EQ comparisons where one of the pairs | ||
133 | - can be simplified. */ | ||
134 | - i = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
135 | - op->args[0], op->args[2], | ||
136 | - TCG_COND_EQ); | ||
137 | - if (i == 0) { | ||
138 | - goto do_brcond_false; | ||
139 | - } else if (i > 0) { | ||
140 | - goto do_brcond_high; | ||
141 | - } | ||
142 | - i = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
143 | - op->args[1], op->args[3], | ||
144 | - TCG_COND_EQ); | ||
145 | - if (i == 0) { | ||
146 | - goto do_brcond_false; | ||
147 | - } else if (i < 0) { | ||
148 | - break; | ||
149 | - } | ||
150 | - do_brcond_low: | ||
151 | - memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
152 | - op->opc = INDEX_op_brcond_i32; | ||
153 | - op->args[1] = op->args[2]; | ||
154 | - op->args[2] = op->args[4]; | ||
155 | - op->args[3] = op->args[5]; | ||
156 | - break; | ||
157 | - } | ||
158 | - if (op->args[4] == TCG_COND_NE) { | ||
159 | - /* Simplify NE comparisons where one of the pairs | ||
160 | - can be simplified. */ | ||
161 | - i = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
162 | - op->args[0], op->args[2], | ||
163 | - TCG_COND_NE); | ||
164 | - if (i == 0) { | ||
165 | - goto do_brcond_high; | ||
166 | - } else if (i > 0) { | ||
167 | - goto do_brcond_true; | ||
168 | - } | ||
169 | - i = do_constant_folding_cond(INDEX_op_brcond_i32, | ||
170 | - op->args[1], op->args[3], | ||
171 | - TCG_COND_NE); | ||
172 | - if (i == 0) { | ||
173 | - goto do_brcond_low; | ||
174 | - } else if (i > 0) { | ||
175 | - goto do_brcond_true; | ||
176 | - } | ||
177 | - } | ||
178 | - break; | ||
179 | - | ||
180 | default: | ||
181 | break; | ||
182 | |||
183 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
184 | CASE_OP_32_64_VEC(andc): | ||
185 | done = fold_andc(&ctx, op); | ||
186 | break; | ||
187 | + case INDEX_op_brcond2_i32: | ||
188 | + done = fold_brcond2(&ctx, op); | ||
189 | + break; | ||
190 | CASE_OP_32_64(ctpop): | ||
191 | done = fold_ctpop(&ctx, op); | ||
192 | break; | ||
124 | -- | 193 | -- |
125 | 2.25.1 | 194 | 2.25.1 |
126 | 195 | ||
127 | 196 | diff view generated by jsdifflib |
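
Both tcg/arm expansions in the left-hand patches above lean on one NEON property, called out in the code comments: VSHL (register) shifts each element left by a signed per-element count, so a right shift is a left shift by the negated count, and a rotate is the OR of a left shift by n and a left shift by n minus the element width. A scalar sketch of the rotate identity in plain C, with the element width fixed at 32 bits for illustration (NEON's out-of-range count behaviour is glossed over here):

    #include <stdint.h>
    #include <stdio.h>

    /* Left shift by a signed count, as NEON's VSHL (register) does
     * per element: a negative count shifts right instead. */
    static uint32_t ushl32(uint32_t x, int n)
    {
        if (n >= 32 || n <= -32) {
            return 0;
        }
        return n >= 0 ? x << n : x >> -n;
    }

    /* rotl(x, n) == (x << n) | (x >> (32 - n)), expressed with only
     * signed left shifts: the second term is a shift by n - 32. */
    static uint32_t rotl32(uint32_t x, int n)
    {
        return ushl32(x, n) | ushl32(x, n - 32);
    }

    int main(void)
    {
        printf("%08x\n", rotl32(0x80000001u, 4)); /* 00000018 */
        return 0;
    }

This mirrors the rotlv_vec expansion, which computes t1 = v2 - (8 << vece) and ORs the two arm_ushl_vec results; rotrv_vec is the same idea with the counts negated.
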
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 33 +++++++++++++++++++-------------- | ||
6 | 1 file changed, 19 insertions(+), 14 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op) | ||
13 | return fold_const2(ctx, op); | ||
14 | } | ||
15 | |||
16 | +static bool fold_brcond(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + TCGCond cond = op->args[2]; | ||
19 | + int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond); | ||
20 | + | ||
21 | + if (i == 0) { | ||
22 | + tcg_op_remove(ctx->tcg, op); | ||
23 | + return true; | ||
24 | + } | ||
25 | + if (i > 0) { | ||
26 | + op->opc = INDEX_op_br; | ||
27 | + op->args[0] = op->args[3]; | ||
28 | + } | ||
29 | + return false; | ||
30 | +} | ||
31 | + | ||
32 | static bool fold_brcond2(OptContext *ctx, TCGOp *op) | ||
33 | { | ||
34 | TCGCond cond = op->args[4]; | ||
35 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
36 | } | ||
37 | break; | ||
38 | |||
39 | - CASE_OP_32_64(brcond): | ||
40 | - i = do_constant_folding_cond(opc, op->args[0], | ||
41 | - op->args[1], op->args[2]); | ||
42 | - if (i == 0) { | ||
43 | - tcg_op_remove(s, op); | ||
44 | - continue; | ||
45 | - } else if (i > 0) { | ||
46 | - memset(&ctx.temps_used, 0, sizeof(ctx.temps_used)); | ||
47 | - op->opc = opc = INDEX_op_br; | ||
48 | - op->args[0] = op->args[3]; | ||
49 | - break; | ||
50 | - } | ||
51 | - break; | ||
52 | - | ||
53 | CASE_OP_32_64(movcond): | ||
54 | i = do_constant_folding_cond(opc, op->args[1], | ||
55 | op->args[2], op->args[5]); | ||
56 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
57 | CASE_OP_32_64_VEC(andc): | ||
58 | done = fold_andc(&ctx, op); | ||
59 | break; | ||
60 | + CASE_OP_32_64(brcond): | ||
61 | + done = fold_brcond(&ctx, op); | ||
62 | + break; | ||
63 | case INDEX_op_brcond2_i32: | ||
64 | done = fold_brcond2(&ctx, op); | ||
65 | break; | ||
66 | -- | ||
67 | 2.25.1 | ||
68 | |||
69 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 23 ++++++++++++++--------- | ||
6 | 1 file changed, 14 insertions(+), 9 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op) | ||
13 | return fold_const2(ctx, op); | ||
14 | } | ||
15 | |||
16 | +static bool fold_setcond(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + TCGCond cond = op->args[3]; | ||
19 | + int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond); | ||
20 | + | ||
21 | + if (i >= 0) { | ||
22 | + return tcg_opt_gen_movi(ctx, op, op->args[0], i); | ||
23 | + } | ||
24 | + return false; | ||
25 | +} | ||
26 | + | ||
27 | static bool fold_setcond2(OptContext *ctx, TCGOp *op) | ||
28 | { | ||
29 | TCGCond cond = op->args[5]; | ||
30 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
31 | } | ||
32 | break; | ||
33 | |||
34 | - CASE_OP_32_64(setcond): | ||
35 | - i = do_constant_folding_cond(opc, op->args[1], | ||
36 | - op->args[2], op->args[3]); | ||
37 | - if (i >= 0) { | ||
38 | - tcg_opt_gen_movi(&ctx, op, op->args[0], i); | ||
39 | - continue; | ||
40 | - } | ||
41 | - break; | ||
42 | - | ||
43 | CASE_OP_32_64(movcond): | ||
44 | i = do_constant_folding_cond(opc, op->args[1], | ||
45 | op->args[2], op->args[5]); | ||
46 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
47 | CASE_OP_32_64(shr): | ||
48 | done = fold_shift(&ctx, op); | ||
49 | break; | ||
50 | + CASE_OP_32_64(setcond): | ||
51 | + done = fold_setcond(&ctx, op); | ||
52 | + break; | ||
53 | case INDEX_op_setcond2_i32: | ||
54 | done = fold_setcond2(&ctx, op); | ||
55 | break; | ||
56 | -- | ||
57 | 2.25.1 | ||
58 | |||
59 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 37 +++++++++++++++++++++---------------- | ||
6 | 1 file changed, 21 insertions(+), 16 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op) | ||
13 | return fold_const2(ctx, op); | ||
14 | } | ||
15 | |||
16 | +static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) { | ||
19 | + uint32_t a = arg_info(op->args[2])->val; | ||
20 | + uint32_t b = arg_info(op->args[3])->val; | ||
21 | + uint64_t r = (uint64_t)a * b; | ||
22 | + TCGArg rl, rh; | ||
23 | + TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32); | ||
24 | + | ||
25 | + rl = op->args[0]; | ||
26 | + rh = op->args[1]; | ||
27 | + tcg_opt_gen_movi(ctx, op, rl, (int32_t)r); | ||
28 | + tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32)); | ||
29 | + return true; | ||
30 | + } | ||
31 | + return false; | ||
32 | +} | ||
33 | + | ||
34 | static bool fold_nand(OptContext *ctx, TCGOp *op) | ||
35 | { | ||
36 | return fold_const2(ctx, op); | ||
37 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
38 | } | ||
39 | break; | ||
40 | |||
41 | - case INDEX_op_mulu2_i32: | ||
42 | - if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) { | ||
43 | - uint32_t a = arg_info(op->args[2])->val; | ||
44 | - uint32_t b = arg_info(op->args[3])->val; | ||
45 | - uint64_t r = (uint64_t)a * b; | ||
46 | - TCGArg rl, rh; | ||
47 | - TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32); | ||
48 | - | ||
49 | - rl = op->args[0]; | ||
50 | - rh = op->args[1]; | ||
51 | - tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r); | ||
52 | - tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32)); | ||
53 | - continue; | ||
54 | - } | ||
55 | - break; | ||
56 | - | ||
57 | default: | ||
58 | break; | ||
59 | |||
60 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
61 | CASE_OP_32_64(muluh): | ||
62 | done = fold_mul_highpart(&ctx, op); | ||
63 | break; | ||
64 | + case INDEX_op_mulu2_i32: | ||
65 | + done = fold_mulu2_i32(&ctx, op); | ||
66 | + break; | ||
67 | CASE_OP_32_64(nand): | ||
68 | done = fold_nand(&ctx, op); | ||
69 | break; | ||
70 | -- | ||
71 | 2.25.1 | ||
72 | |||
73 | diff view generated by jsdifflib |
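
fold_mulu2_i32 above folds the constant case with the usual widening-multiply trick: promoting one 32-bit operand to uint64_t is enough to get the full 64-bit product, which is then split across the two destinations. A standalone sketch of just the arithmetic (the helper name is invented; TCG itself emits the second result via the extra mov inserted by tcg_op_insert_before):

    #include <stdint.h>
    #include <stdio.h>

    /* 32x32 -> 64 unsigned multiply, split into (rl, rh) like mulu2_i32. */
    static void mulu2(uint32_t a, uint32_t b, uint32_t *rl, uint32_t *rh)
    {
        uint64_t r = (uint64_t)a * b;   /* one promoted operand suffices */

        *rl = (uint32_t)r;
        *rh = (uint32_t)(r >> 32);
    }

    int main(void)
    {
        uint32_t lo, hi;

        mulu2(0xffffffffu, 0xffffffffu, &lo, &hi);
        printf("%08x %08x\n", hi, lo);  /* fffffffe 00000001 */
        return 0;
    }
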
New patch | |||
---|---|---|---|
1 | Add two additional helpers, fold_add2_i32 and fold_sub2_i32, | ||
2 | which will not remain simple wrappers forever. | ||
1 | 3 | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 70 +++++++++++++++++++++++++++++++------------------- | ||
9 | 1 file changed, 44 insertions(+), 26 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op) | ||
16 | return fold_const2(ctx, op); | ||
17 | } | ||
18 | |||
19 | +static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add) | ||
20 | +{ | ||
21 | + if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) && | ||
22 | + arg_is_const(op->args[4]) && arg_is_const(op->args[5])) { | ||
23 | + uint32_t al = arg_info(op->args[2])->val; | ||
24 | + uint32_t ah = arg_info(op->args[3])->val; | ||
25 | + uint32_t bl = arg_info(op->args[4])->val; | ||
26 | + uint32_t bh = arg_info(op->args[5])->val; | ||
27 | + uint64_t a = ((uint64_t)ah << 32) | al; | ||
28 | + uint64_t b = ((uint64_t)bh << 32) | bl; | ||
29 | + TCGArg rl, rh; | ||
30 | + TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32); | ||
31 | + | ||
32 | + if (add) { | ||
33 | + a += b; | ||
34 | + } else { | ||
35 | + a -= b; | ||
36 | + } | ||
37 | + | ||
38 | + rl = op->args[0]; | ||
39 | + rh = op->args[1]; | ||
40 | + tcg_opt_gen_movi(ctx, op, rl, (int32_t)a); | ||
41 | + tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32)); | ||
42 | + return true; | ||
43 | + } | ||
44 | + return false; | ||
45 | +} | ||
46 | + | ||
47 | +static bool fold_add2_i32(OptContext *ctx, TCGOp *op) | ||
48 | +{ | ||
49 | + return fold_addsub2_i32(ctx, op, true); | ||
50 | +} | ||
51 | + | ||
52 | static bool fold_and(OptContext *ctx, TCGOp *op) | ||
53 | { | ||
54 | return fold_const2(ctx, op); | ||
55 | @@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op) | ||
56 | return fold_const2(ctx, op); | ||
57 | } | ||
58 | |||
59 | +static bool fold_sub2_i32(OptContext *ctx, TCGOp *op) | ||
60 | +{ | ||
61 | + return fold_addsub2_i32(ctx, op, false); | ||
62 | +} | ||
63 | + | ||
64 | static bool fold_xor(OptContext *ctx, TCGOp *op) | ||
65 | { | ||
66 | return fold_const2(ctx, op); | ||
67 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
68 | } | ||
69 | break; | ||
70 | |||
71 | - case INDEX_op_add2_i32: | ||
72 | - case INDEX_op_sub2_i32: | ||
73 | - if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) | ||
74 | - && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) { | ||
75 | - uint32_t al = arg_info(op->args[2])->val; | ||
76 | - uint32_t ah = arg_info(op->args[3])->val; | ||
77 | - uint32_t bl = arg_info(op->args[4])->val; | ||
78 | - uint32_t bh = arg_info(op->args[5])->val; | ||
79 | - uint64_t a = ((uint64_t)ah << 32) | al; | ||
80 | - uint64_t b = ((uint64_t)bh << 32) | bl; | ||
81 | - TCGArg rl, rh; | ||
82 | - TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32); | ||
83 | - | ||
84 | - if (opc == INDEX_op_add2_i32) { | ||
85 | - a += b; | ||
86 | - } else { | ||
87 | - a -= b; | ||
88 | - } | ||
89 | - | ||
90 | - rl = op->args[0]; | ||
91 | - rh = op->args[1]; | ||
92 | - tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a); | ||
93 | - tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32)); | ||
94 | - continue; | ||
95 | - } | ||
96 | - break; | ||
97 | |||
98 | default: | ||
99 | break; | ||
100 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
101 | CASE_OP_32_64_VEC(add): | ||
102 | done = fold_add(&ctx, op); | ||
103 | break; | ||
104 | + case INDEX_op_add2_i32: | ||
105 | + done = fold_add2_i32(&ctx, op); | ||
106 | + break; | ||
107 | CASE_OP_32_64_VEC(and): | ||
108 | done = fold_and(&ctx, op); | ||
109 | break; | ||
110 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
111 | CASE_OP_32_64_VEC(sub): | ||
112 | done = fold_sub(&ctx, op); | ||
113 | break; | ||
114 | + case INDEX_op_sub2_i32: | ||
115 | + done = fold_sub2_i32(&ctx, op); | ||
116 | + break; | ||
117 | CASE_OP_32_64_VEC(xor): | ||
118 | done = fold_xor(&ctx, op); | ||
119 | break; | ||
120 | -- | ||
121 | 2.25.1 | ||
122 | |||
123 | diff view generated by jsdifflib |
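
fold_addsub2_i32 above works the same way for double-word add/sub: reassemble each 64-bit value from its constant 32-bit halves, do one 64-bit add or subtract (which handles the inter-word carry for free), and split the result back into the two destinations. A self-contained sketch of the add direction:

    #include <stdint.h>
    #include <stdio.h>

    /* Constant-fold a 32-bit add2: (ah:al) + (bh:bl) -> (rh:rl). */
    static void add2(uint32_t al, uint32_t ah, uint32_t bl, uint32_t bh,
                     uint32_t *rl, uint32_t *rh)
    {
        uint64_t a = ((uint64_t)ah << 32) | al;
        uint64_t b = ((uint64_t)bh << 32) | bl;

        a += b;                       /* carry between halves is implicit */
        *rl = (uint32_t)a;
        *rh = (uint32_t)(a >> 32);
    }

    int main(void)
    {
        uint32_t rl, rh;

        add2(0xffffffffu, 0, 1, 0, &rl, &rh);
        printf("%08x %08x\n", rh, rl);  /* 00000001 00000000 */
        return 0;
    }
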
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 56 ++++++++++++++++++++++++++++---------------------- | ||
6 | 1 file changed, 31 insertions(+), 25 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op) | ||
13 | return true; | ||
14 | } | ||
15 | |||
16 | +static bool fold_movcond(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + TCGOpcode opc = op->opc; | ||
19 | + TCGCond cond = op->args[5]; | ||
20 | + int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond); | ||
21 | + | ||
22 | + if (i >= 0) { | ||
23 | + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]); | ||
24 | + } | ||
25 | + | ||
26 | + if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { | ||
27 | + uint64_t tv = arg_info(op->args[3])->val; | ||
28 | + uint64_t fv = arg_info(op->args[4])->val; | ||
29 | + | ||
30 | + opc = (opc == INDEX_op_movcond_i32 | ||
31 | + ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64); | ||
32 | + | ||
33 | + if (tv == 1 && fv == 0) { | ||
34 | + op->opc = opc; | ||
35 | + op->args[3] = cond; | ||
36 | + } else if (fv == 1 && tv == 0) { | ||
37 | + op->opc = opc; | ||
38 | + op->args[3] = tcg_invert_cond(cond); | ||
39 | + } | ||
40 | + } | ||
41 | + return false; | ||
42 | +} | ||
43 | + | ||
44 | static bool fold_mul(OptContext *ctx, TCGOp *op) | ||
45 | { | ||
46 | return fold_const2(ctx, op); | ||
47 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
48 | } | ||
49 | break; | ||
50 | |||
51 | - CASE_OP_32_64(movcond): | ||
52 | - i = do_constant_folding_cond(opc, op->args[1], | ||
53 | - op->args[2], op->args[5]); | ||
54 | - if (i >= 0) { | ||
55 | - tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]); | ||
56 | - continue; | ||
57 | - } | ||
58 | - if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { | ||
59 | - uint64_t tv = arg_info(op->args[3])->val; | ||
60 | - uint64_t fv = arg_info(op->args[4])->val; | ||
61 | - TCGCond cond = op->args[5]; | ||
62 | - | ||
63 | - if (fv == 1 && tv == 0) { | ||
64 | - cond = tcg_invert_cond(cond); | ||
65 | - } else if (!(tv == 1 && fv == 0)) { | ||
66 | - break; | ||
67 | - } | ||
68 | - op->args[3] = cond; | ||
69 | - op->opc = opc = (opc == INDEX_op_movcond_i32 | ||
70 | - ? INDEX_op_setcond_i32 | ||
71 | - : INDEX_op_setcond_i64); | ||
72 | - } | ||
73 | - break; | ||
74 | - | ||
75 | - | ||
76 | default: | ||
77 | break; | ||
78 | |||
79 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
80 | case INDEX_op_mb: | ||
81 | done = fold_mb(&ctx, op); | ||
82 | break; | ||
83 | + CASE_OP_32_64(movcond): | ||
84 | + done = fold_movcond(&ctx, op); | ||
85 | + break; | ||
86 | CASE_OP_32_64(mul): | ||
87 | done = fold_mul(&ctx, op); | ||
88 | break; | ||
89 | -- | ||
90 | 2.25.1 | ||
91 | |||
92 | diff view generated by jsdifflib |
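
Two details of fold_movcond above are worth spelling out. First, when the condition folds to a constant i (0 or 1), the result is op->args[4 - i]: args[3] is the true value and args[4] the false value, so the index arithmetic picks the right one. Second, the old strength reduction is kept: a movcond whose value operands are the constants 1 and 0 is just a setcond, with the condition inverted when they are 0 and 1. A scalar sketch of why that reduction is sound (plain C stand-ins, not TCG code):

    #include <stdint.h>
    #include <stdio.h>

    /* setcond produces 0 or 1; 'lt' selects LT versus its inverse GE. */
    static uint64_t setcond(int lt, uint64_t c1, uint64_t c2)
    {
        return lt ? c1 < c2 : c1 >= c2;
    }

    static uint64_t movcond(int lt, uint64_t c1, uint64_t c2,
                            uint64_t tv, uint64_t fv)
    {
        return (lt ? c1 < c2 : c1 >= c2) ? tv : fv;
    }

    int main(void)
    {
        /* tv=1, fv=0: movcond collapses to setcond with the same cond. */
        printf("%d\n", movcond(1, 2, 3, 1, 0) == setcond(1, 2, 3)); /* 1 */
        /* tv=0, fv=1: movcond collapses to setcond with the inverted cond. */
        printf("%d\n", movcond(1, 2, 3, 0, 1) == setcond(0, 2, 3)); /* 1 */
        return 0;
    }
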
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 39 ++++++++++++++++++++++----------------- | ||
6 | 1 file changed, 22 insertions(+), 17 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
13 | return fold_const2(ctx, op); | ||
14 | } | ||
15 | |||
16 | +static bool fold_extract2(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
19 | + uint64_t v1 = arg_info(op->args[1])->val; | ||
20 | + uint64_t v2 = arg_info(op->args[2])->val; | ||
21 | + int shr = op->args[3]; | ||
22 | + | ||
23 | + if (op->opc == INDEX_op_extract2_i64) { | ||
24 | + v1 >>= shr; | ||
25 | + v2 <<= 64 - shr; | ||
26 | + } else { | ||
27 | + v1 = (uint32_t)v1 >> shr; | ||
28 | + v2 = (int32_t)v2 << (32 - shr); | ||
29 | + } | ||
30 | + return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2); | ||
31 | + } | ||
32 | + return false; | ||
33 | +} | ||
34 | + | ||
35 | static bool fold_exts(OptContext *ctx, TCGOp *op) | ||
36 | { | ||
37 | return fold_const1(ctx, op); | ||
38 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
39 | } | ||
40 | break; | ||
41 | |||
42 | - CASE_OP_32_64(extract2): | ||
43 | - if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
44 | - uint64_t v1 = arg_info(op->args[1])->val; | ||
45 | - uint64_t v2 = arg_info(op->args[2])->val; | ||
46 | - int shr = op->args[3]; | ||
47 | - | ||
48 | - if (opc == INDEX_op_extract2_i64) { | ||
49 | - tmp = (v1 >> shr) | (v2 << (64 - shr)); | ||
50 | - } else { | ||
51 | - tmp = (int32_t)(((uint32_t)v1 >> shr) | | ||
52 | - ((uint32_t)v2 << (32 - shr))); | ||
53 | - } | ||
54 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
55 | - continue; | ||
56 | - } | ||
57 | - break; | ||
58 | - | ||
59 | default: | ||
60 | break; | ||
61 | |||
62 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
63 | CASE_OP_32_64(eqv): | ||
64 | done = fold_eqv(&ctx, op); | ||
65 | break; | ||
66 | + CASE_OP_32_64(extract2): | ||
67 | + done = fold_extract2(&ctx, op); | ||
68 | + break; | ||
69 | CASE_OP_32_64(ext8s): | ||
70 | CASE_OP_32_64(ext16s): | ||
71 | case INDEX_op_ext32s_i64: | ||
72 | -- | ||
73 | 2.25.1 | ||
74 | |||
75 | diff view generated by jsdifflib |
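
fold_extract2 above evaluates the funnel-shift semantics of extract2 at compile time: treat arg2:arg1 as one double-width value and take the word that starts shr bits into arg1, i.e. (v1 >> shr) | (v2 << (width - shr)). The 32-bit variant in the patch does this in 64-bit arithmetic, with the casts keeping the folded 32-bit result sign-extended to 64 bits; a pure 32-bit sketch of the same operation (shr == 0 guarded to avoid an undefined full-width shift):

    #include <stdint.h>
    #include <stdio.h>

    /* extract2: the 32 bits starting shr bits into the pair v2:v1. */
    static uint32_t extract2_32(uint32_t v1, uint32_t v2, int shr)
    {
        return shr ? (v1 >> shr) | (v2 << (32 - shr)) : v1;
    }

    int main(void)
    {
        /* Pair is 0xdeadbeef:0x12345678; grab 32 bits at bit 16. */
        printf("%08x\n", extract2_32(0x12345678u, 0xdeadbeefu, 16));
        /* beef1234 */
        return 0;
    }
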
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------ | ||
6 | 1 file changed, 30 insertions(+), 18 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
13 | return fold_const2(ctx, op); | ||
14 | } | ||
15 | |||
16 | +static bool fold_extract(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + if (arg_is_const(op->args[1])) { | ||
19 | + uint64_t t; | ||
20 | + | ||
21 | + t = arg_info(op->args[1])->val; | ||
22 | + t = extract64(t, op->args[2], op->args[3]); | ||
23 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
24 | + } | ||
25 | + return false; | ||
26 | +} | ||
27 | + | ||
28 | static bool fold_extract2(OptContext *ctx, TCGOp *op) | ||
29 | { | ||
30 | if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
31 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op) | ||
32 | return tcg_opt_gen_movi(ctx, op, op->args[0], i); | ||
33 | } | ||
34 | |||
35 | +static bool fold_sextract(OptContext *ctx, TCGOp *op) | ||
36 | +{ | ||
37 | + if (arg_is_const(op->args[1])) { | ||
38 | + uint64_t t; | ||
39 | + | ||
40 | + t = arg_info(op->args[1])->val; | ||
41 | + t = sextract64(t, op->args[2], op->args[3]); | ||
42 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
43 | + } | ||
44 | + return false; | ||
45 | +} | ||
46 | + | ||
47 | static bool fold_shift(OptContext *ctx, TCGOp *op) | ||
48 | { | ||
49 | return fold_const2(ctx, op); | ||
50 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
51 | } | ||
52 | break; | ||
53 | |||
54 | - CASE_OP_32_64(extract): | ||
55 | - if (arg_is_const(op->args[1])) { | ||
56 | - tmp = extract64(arg_info(op->args[1])->val, | ||
57 | - op->args[2], op->args[3]); | ||
58 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
59 | - continue; | ||
60 | - } | ||
61 | - break; | ||
62 | - | ||
63 | - CASE_OP_32_64(sextract): | ||
64 | - if (arg_is_const(op->args[1])) { | ||
65 | - tmp = sextract64(arg_info(op->args[1])->val, | ||
66 | - op->args[2], op->args[3]); | ||
67 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
68 | - continue; | ||
69 | - } | ||
70 | - break; | ||
71 | - | ||
72 | default: | ||
73 | break; | ||
74 | |||
75 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
76 | CASE_OP_32_64(eqv): | ||
77 | done = fold_eqv(&ctx, op); | ||
78 | break; | ||
79 | + CASE_OP_32_64(extract): | ||
80 | + done = fold_extract(&ctx, op); | ||
81 | + break; | ||
82 | CASE_OP_32_64(extract2): | ||
83 | done = fold_extract2(&ctx, op); | ||
84 | break; | ||
85 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
86 | case INDEX_op_setcond2_i32: | ||
87 | done = fold_setcond2(&ctx, op); | ||
88 | break; | ||
89 | + CASE_OP_32_64(sextract): | ||
90 | + done = fold_sextract(&ctx, op); | ||
91 | + break; | ||
92 | CASE_OP_32_64_VEC(sub): | ||
93 | done = fold_sub(&ctx, op); | ||
94 | break; | ||
95 | -- | ||
96 | 2.25.1 | ||
97 | |||
98 | diff view generated by jsdifflib |
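
fold_extract and fold_sextract above just evaluate the bitfield at compile time via extract64/sextract64, qemu's helpers from include/qemu/bitops.h. For reference, they amount to shift-and-mask, with the signed form sign-extending from the top bit of the field; a minimal reimplementation under the usual pos/len preconditions (the short names here are invented):

    #include <stdint.h>
    #include <stdio.h>

    /* Unsigned field: len bits at pos, with 0 < len and pos + len <= 64. */
    static uint64_t xt64(uint64_t v, int pos, int len)
    {
        return (v >> pos) & (~0ull >> (64 - len));
    }

    /* Signed field: same bits, sign-extended from bit len - 1.
     * Assumes arithmetic right shift of int64_t, as qemu does. */
    static int64_t sxt64(uint64_t v, int pos, int len)
    {
        return (int64_t)(v << (64 - pos - len)) >> (64 - len);
    }

    int main(void)
    {
        printf("%llx\n", (unsigned long long)xt64(0xf0f0, 8, 8)); /* f0 */
        printf("%lld\n", (long long)sxt64(0xf0f0, 8, 8));         /* -16 */
        return 0;
    }
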
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 25 +++++++++++++++---------- | ||
6 | 1 file changed, 15 insertions(+), 10 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op) | ||
13 | return fold_const1(ctx, op); | ||
14 | } | ||
15 | |||
16 | +static bool fold_deposit(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
19 | + uint64_t t1 = arg_info(op->args[1])->val; | ||
20 | + uint64_t t2 = arg_info(op->args[2])->val; | ||
21 | + | ||
22 | + t1 = deposit64(t1, op->args[3], op->args[4], t2); | ||
23 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t1); | ||
24 | + } | ||
25 | + return false; | ||
26 | +} | ||
27 | + | ||
28 | static bool fold_divide(OptContext *ctx, TCGOp *op) | ||
29 | { | ||
30 | return fold_const2(ctx, op); | ||
31 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
32 | } | ||
33 | break; | ||
34 | |||
35 | - CASE_OP_32_64(deposit): | ||
36 | - if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
37 | - tmp = deposit64(arg_info(op->args[1])->val, | ||
38 | - op->args[3], op->args[4], | ||
39 | - arg_info(op->args[2])->val); | ||
40 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
41 | - continue; | ||
42 | - } | ||
43 | - break; | ||
44 | - | ||
45 | default: | ||
46 | break; | ||
47 | |||
48 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
49 | CASE_OP_32_64(ctpop): | ||
50 | done = fold_ctpop(&ctx, op); | ||
51 | break; | ||
52 | + CASE_OP_32_64(deposit): | ||
53 | + done = fold_deposit(&ctx, op); | ||
54 | + break; | ||
55 | CASE_OP_32_64(div): | ||
56 | CASE_OP_32_64(divu): | ||
57 | done = fold_divide(&ctx, op); | ||
58 | -- | ||
59 | 2.25.1 | ||
60 | |||
61 | diff view generated by jsdifflib |
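
fold_deposit above is the inverse operation: with both inputs constant it computes deposit64(t1, pos, len, t2), i.e. t1 with its len-bit field at pos replaced by the low bits of t2. A minimal reimplementation of the helper for reference (the real one lives in include/qemu/bitops.h):

    #include <stdint.h>
    #include <stdio.h>

    /* Overwrite the len-bit field of base at pos with the low bits of val. */
    static uint64_t dep64(uint64_t base, int pos, int len, uint64_t val)
    {
        uint64_t mask = (~0ull >> (64 - len)) << pos;

        return (base & ~mask) | ((val << pos) & mask);
    }

    int main(void)
    {
        /* Drop 0xab into bits [8, 16) of 0x11223344. */
        printf("%llx\n", (unsigned long long)
               dep64(0x11223344u, 8, 8, 0xab));  /* 1122ab44 */
        return 0;
    }
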
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 32 ++++++++++++++++++-------------- | ||
6 | 1 file changed, 18 insertions(+), 14 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op) | ||
13 | return true; | ||
14 | } | ||
15 | |||
16 | +static bool fold_count_zeros(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + if (arg_is_const(op->args[1])) { | ||
19 | + uint64_t t = arg_info(op->args[1])->val; | ||
20 | + | ||
21 | + if (t != 0) { | ||
22 | + t = do_constant_folding(op->opc, t, 0); | ||
23 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
24 | + } | ||
25 | + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]); | ||
26 | + } | ||
27 | + return false; | ||
28 | +} | ||
29 | + | ||
30 | static bool fold_ctpop(OptContext *ctx, TCGOp *op) | ||
31 | { | ||
32 | return fold_const1(ctx, op); | ||
33 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
34 | } | ||
35 | break; | ||
36 | |||
37 | - CASE_OP_32_64(clz): | ||
38 | - CASE_OP_32_64(ctz): | ||
39 | - if (arg_is_const(op->args[1])) { | ||
40 | - TCGArg v = arg_info(op->args[1])->val; | ||
41 | - if (v != 0) { | ||
42 | - tmp = do_constant_folding(opc, v, 0); | ||
43 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
44 | - } else { | ||
45 | - tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]); | ||
46 | - } | ||
47 | - continue; | ||
48 | - } | ||
49 | - break; | ||
50 | - | ||
51 | default: | ||
52 | break; | ||
53 | |||
54 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
55 | case INDEX_op_brcond2_i32: | ||
56 | done = fold_brcond2(&ctx, op); | ||
57 | break; | ||
58 | + CASE_OP_32_64(clz): | ||
59 | + CASE_OP_32_64(ctz): | ||
60 | + done = fold_count_zeros(&ctx, op); | ||
61 | + break; | ||
62 | CASE_OP_32_64(ctpop): | ||
63 | done = fold_ctpop(&ctx, op); | ||
64 | break; | ||
65 | -- | ||
66 | 2.25.1 | ||
67 | |||
68 | diff view generated by jsdifflib |
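
fold_count_zeros above matches the two-operand form of TCG's clz/ctz, where the second source supplies the result for a zero input: a constant nonzero input folds to a constant count, while a constant zero folds to a plain copy of the fallback operand. A scalar sketch of those semantics, using GCC/Clang's __builtin_clzll (itself undefined for zero, hence the guard):

    #include <stdint.h>
    #include <stdio.h>

    /* clz with an explicit zero-input value, like clz_i64 t0, t1, t2. */
    static uint64_t clz64(uint64_t t1, uint64_t zero_val)
    {
        return t1 ? (uint64_t)__builtin_clzll(t1) : zero_val;
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)clz64(1, 64));  /* 63 */
        printf("%llu\n", (unsigned long long)clz64(0, 64));  /* 64 */
        return 0;
    }
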
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 27 ++++++++++++++++----------- | ||
6 | 1 file changed, 16 insertions(+), 11 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op) | ||
13 | return false; | ||
14 | } | ||
15 | |||
16 | +static bool fold_bswap(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + if (arg_is_const(op->args[1])) { | ||
19 | + uint64_t t = arg_info(op->args[1])->val; | ||
20 | + | ||
21 | + t = do_constant_folding(op->opc, t, op->args[2]); | ||
22 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
23 | + } | ||
24 | + return false; | ||
25 | +} | ||
26 | + | ||
27 | static bool fold_call(OptContext *ctx, TCGOp *op) | ||
28 | { | ||
29 | TCGContext *s = ctx->tcg; | ||
30 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
31 | } | ||
32 | break; | ||
33 | |||
34 | - CASE_OP_32_64(bswap16): | ||
35 | - CASE_OP_32_64(bswap32): | ||
36 | - case INDEX_op_bswap64_i64: | ||
37 | - if (arg_is_const(op->args[1])) { | ||
38 | - tmp = do_constant_folding(opc, arg_info(op->args[1])->val, | ||
39 | - op->args[2]); | ||
40 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
41 | - continue; | ||
42 | - } | ||
43 | - break; | ||
44 | - | ||
45 | default: | ||
46 | break; | ||
47 | |||
48 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
49 | case INDEX_op_brcond2_i32: | ||
50 | done = fold_brcond2(&ctx, op); | ||
51 | break; | ||
52 | + CASE_OP_32_64(bswap16): | ||
53 | + CASE_OP_32_64(bswap32): | ||
54 | + case INDEX_op_bswap64_i64: | ||
55 | + done = fold_bswap(&ctx, op); | ||
56 | + break; | ||
57 | CASE_OP_32_64(clz): | ||
58 | CASE_OP_32_64(ctz): | ||
59 | done = fold_count_zeros(&ctx, op); | ||
60 | -- | ||
61 | 2.25.1 | ||
62 | |||
63 | diff view generated by jsdifflib |
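
fold_bswap above routes a constant input through do_constant_folding together with op->args[2], which carries the bswap flags describing how the bytes outside the swapped width are treated. For the plain 32-bit case the folded arithmetic is the familiar byte reversal:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t bswap32(uint32_t x)
    {
        return (x >> 24) | ((x >> 8) & 0x0000ff00u)
             | ((x << 8) & 0x00ff0000u) | (x << 24);
    }

    int main(void)
    {
        printf("%08x\n", bswap32(0x12345678u));  /* 78563412 */
        return 0;
    }
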
New patch | |||
---|---|---|---|
1 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
2 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
4 | --- | ||
5 | tcg/optimize.c | 53 +++++++++++++++++++++++++++++--------------------- | ||
6 | 1 file changed, 31 insertions(+), 22 deletions(-) | ||
1 | 7 | ||
8 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
9 | index XXXXXXX..XXXXXXX 100644 | ||
10 | --- a/tcg/optimize.c | ||
11 | +++ b/tcg/optimize.c | ||
12 | @@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op) | ||
13 | return fold_const2(ctx, op); | ||
14 | } | ||
15 | |||
16 | +static bool fold_dup(OptContext *ctx, TCGOp *op) | ||
17 | +{ | ||
18 | + if (arg_is_const(op->args[1])) { | ||
19 | + uint64_t t = arg_info(op->args[1])->val; | ||
20 | + t = dup_const(TCGOP_VECE(op), t); | ||
21 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
22 | + } | ||
23 | + return false; | ||
24 | +} | ||
25 | + | ||
26 | +static bool fold_dup2(OptContext *ctx, TCGOp *op) | ||
27 | +{ | ||
28 | + if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
29 | + uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32, | ||
30 | + arg_info(op->args[2])->val); | ||
31 | + return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
32 | + } | ||
33 | + | ||
34 | + if (args_are_copies(op->args[1], op->args[2])) { | ||
35 | + op->opc = INDEX_op_dup_vec; | ||
36 | + TCGOP_VECE(op) = MO_32; | ||
37 | + } | ||
38 | + return false; | ||
39 | +} | ||
40 | + | ||
41 | static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
42 | { | ||
43 | return fold_const2(ctx, op); | ||
44 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
45 | done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
46 | break; | ||
47 | |||
48 | - case INDEX_op_dup_vec: | ||
49 | - if (arg_is_const(op->args[1])) { | ||
50 | - tmp = arg_info(op->args[1])->val; | ||
51 | - tmp = dup_const(TCGOP_VECE(op), tmp); | ||
52 | - tcg_opt_gen_movi(&ctx, op, op->args[0], tmp); | ||
53 | - continue; | ||
54 | - } | ||
55 | - break; | ||
56 | - | ||
57 | - case INDEX_op_dup2_vec: | ||
58 | - assert(TCG_TARGET_REG_BITS == 32); | ||
59 | - if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) { | ||
60 | - tcg_opt_gen_movi(&ctx, op, op->args[0], | ||
61 | - deposit64(arg_info(op->args[1])->val, 32, 32, | ||
62 | - arg_info(op->args[2])->val)); | ||
63 | - continue; | ||
64 | - } else if (args_are_copies(op->args[1], op->args[2])) { | ||
65 | - op->opc = INDEX_op_dup_vec; | ||
66 | - TCGOP_VECE(op) = MO_32; | ||
67 | - } | ||
68 | - break; | ||
69 | - | ||
70 | default: | ||
71 | break; | ||
72 | |||
73 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
74 | CASE_OP_32_64(divu): | ||
75 | done = fold_divide(&ctx, op); | ||
76 | break; | ||
77 | + case INDEX_op_dup_vec: | ||
78 | + done = fold_dup(&ctx, op); | ||
79 | + break; | ||
80 | + case INDEX_op_dup2_vec: | ||
81 | + done = fold_dup2(&ctx, op); | ||
82 | + break; | ||
83 | CASE_OP_32_64(eqv): | ||
84 | done = fold_eqv(&ctx, op); | ||
85 | break; | ||
86 | -- | ||
87 | 2.25.1 | ||
88 | |||
89 | diff view generated by jsdifflib |
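
fold_dup above folds a constant splat through dup_const, which replicates an element of 1 << vece bytes across the 64-bit constant; fold_dup2 either glues two 32-bit constants together with deposit64 or, when both inputs are copies of the same temp, canonicalizes dup2_vec back into dup_vec at MO_32. A sketch of the replication step, done by multiplying with a repeating-ones pattern (the real dup_const also has a constant-expression path):

    #include <stdint.h>
    #include <stdio.h>

    /* Replicate a 1/2/4-byte element across 64 bits; vece is log2(size). */
    static uint64_t dup_const64(unsigned vece, uint64_t c)
    {
        switch (vece) {
        case 0: return 0x0101010101010101ull * (uint8_t)c;   /* MO_8 */
        case 1: return 0x0001000100010001ull * (uint16_t)c;  /* MO_16 */
        case 2: return 0x0000000100000001ull * (uint32_t)c;  /* MO_32 */
        default: return c;                                   /* MO_64 */
        }
    }

    int main(void)
    {
        printf("%016llx\n", (unsigned long long)dup_const64(0, 0xab));
        /* abababababababab */
        return 0;
    }
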
New patch | |||
---|---|---|---|
1 | This is the final entry in the main switch that remained in a | ||
2 | different form. After this, we have the option to convert | ||
3 | the switch into a function dispatch table. | ||
1 | 4 | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/optimize.c | 27 ++++++++++++++------------- | ||
10 | 1 file changed, 14 insertions(+), 13 deletions(-) | ||
11 | |||
12 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/optimize.c | ||
15 | +++ b/tcg/optimize.c | ||
16 | @@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op) | ||
17 | return true; | ||
18 | } | ||
19 | |||
20 | +static bool fold_mov(OptContext *ctx, TCGOp *op) | ||
21 | +{ | ||
22 | + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]); | ||
23 | +} | ||
24 | + | ||
25 | static bool fold_movcond(OptContext *ctx, TCGOp *op) | ||
26 | { | ||
27 | TCGOpcode opc = op->opc; | ||
28 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
29 | break; | ||
30 | } | ||
31 | |||
32 | - /* Propagate constants through copy operations and do constant | ||
33 | - folding. Constants will be substituted to arguments by register | ||
34 | - allocator where needed and possible. Also detect copies. */ | ||
35 | + /* | ||
36 | + * Process each opcode. | ||
37 | + * Sorted alphabetically by opcode as much as possible. | ||
38 | + */ | ||
39 | switch (opc) { | ||
40 | - CASE_OP_32_64_VEC(mov): | ||
41 | - done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
42 | - break; | ||
43 | - | ||
44 | - default: | ||
45 | - break; | ||
46 | - | ||
47 | - /* ---------------------------------------------------------- */ | ||
48 | - /* Sorted alphabetically by opcode as much as possible. */ | ||
49 | - | ||
50 | CASE_OP_32_64_VEC(add): | ||
51 | done = fold_add(&ctx, op); | ||
52 | break; | ||
53 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
54 | case INDEX_op_mb: | ||
55 | done = fold_mb(&ctx, op); | ||
56 | break; | ||
57 | + CASE_OP_32_64_VEC(mov): | ||
58 | + done = fold_mov(&ctx, op); | ||
59 | + break; | ||
60 | CASE_OP_32_64(movcond): | ||
61 | done = fold_movcond(&ctx, op); | ||
62 | break; | ||
63 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
64 | CASE_OP_32_64_VEC(xor): | ||
65 | done = fold_xor(&ctx, op); | ||
66 | break; | ||
67 | + default: | ||
68 | + break; | ||
69 | } | ||
70 | |||
71 | if (!done) { | ||
72 | -- | ||
73 | 2.25.1 | ||
74 | |||
75 | diff view generated by jsdifflib |
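
The dispatch table mentioned above would replace the big switch with an array of handlers indexed by opcode, along the lines sketched below; this is hypothetical (the series itself stops at the switch) and the names are invented:

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { OP_ADD, OP_SUB, OP_NOP, OP__MAX } Opcode;
    typedef struct { Opcode opc; } Op;
    typedef bool FoldFn(Op *op);

    static bool fold_add(Op *op) { (void)op; return true; }
    static bool fold_sub(Op *op) { (void)op; return true; }

    /* One handler per opcode; NULL means "no folder, keep the op". */
    static FoldFn * const fold_table[OP__MAX] = {
        [OP_ADD] = fold_add,
        [OP_SUB] = fold_sub,
    };

    static bool dispatch(Op *op)
    {
        FoldFn *fn = fold_table[op->opc];

        return fn ? fn(op) : false;
    }

    int main(void)
    {
        Op a = { OP_ADD }, n = { OP_NOP };

        printf("%d %d\n", dispatch(&a), dispatch(&n));  /* 1 0 */
        return 0;
    }
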
1 | Most of dupi is copied from tcg/aarch64, which has the same | 1 | Pull the "op r, a, a => movi r, 0" optimization into a function, |
---|---|---|---|
2 | encoding for AdvSimdExpandImm. | 2 | and use it in the outer opcode fold functions. |
3 | 3 | ||
4 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> |
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
6 | --- | 7 | --- |
7 | tcg/arm/tcg-target.c.inc | 283 +++++++++++++++++++++++++++++++++++++-- | 8 | tcg/optimize.c | 41 ++++++++++++++++++++++++----------------- |
8 | 1 file changed, 275 insertions(+), 8 deletions(-) | 9 | 1 file changed, 24 insertions(+), 17 deletions(-) |
9 | 10 | ||
10 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 11 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
11 | index XXXXXXX..XXXXXXX 100644 | 12 | index XXXXXXX..XXXXXXX 100644 |
12 | --- a/tcg/arm/tcg-target.c.inc | 13 | --- a/tcg/optimize.c |
13 | +++ b/tcg/arm/tcg-target.c.inc | 14 | +++ b/tcg/optimize.c |
14 | @@ -XXX,XX +XXX,XX @@ typedef enum { | 15 | @@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op) |
15 | |||
16 | INSN_VORR = 0xf2200110, | ||
17 | |||
18 | + INSN_VDUP_G = 0xee800b10, /* VDUP (ARM core register) */ | ||
19 | + INSN_VDUP_S = 0xf3b00c00, /* VDUP (scalar) */ | ||
20 | + INSN_VLDR_D = 0xed100b00, /* VLDR.64 */ | ||
21 | INSN_VLD1 = 0xf4200000, /* VLD1 (multiple single elements) */ | ||
22 | + INSN_VLD1R = 0xf4a00c00, /* VLD1 (single element to all lanes) */ | ||
23 | INSN_VST1 = 0xf4000000, /* VST1 (multiple single elements) */ | ||
24 | + INSN_VMOVI = 0xf2800010, /* VMOV (immediate) */ | ||
25 | } ARMInsn; | ||
26 | |||
27 | #define INSN_NOP (use_armv7_instructions ? INSN_NOP_v6k : INSN_NOP_v4) | ||
28 | @@ -XXX,XX +XXX,XX @@ static const uint8_t tcg_cond_to_arm_cond[] = { | ||
29 | [TCG_COND_GTU] = COND_HI, | ||
30 | }; | ||
31 | |||
32 | +static int encode_imm(uint32_t imm); | ||
33 | + | ||
34 | +/* TCG private relocation type: add with pc+imm8 */ | ||
35 | +#define R_ARM_PC8 11 | ||
36 | + | ||
37 | +/* TCG private relocation type: vldr with imm8 << 2 */ | ||
38 | +#define R_ARM_PC11 12 | ||
39 | + | ||
40 | static bool reloc_pc24(tcg_insn_unit *src_rw, const tcg_insn_unit *target) | ||
41 | { | ||
42 | const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw); | ||
43 | @@ -XXX,XX +XXX,XX @@ static bool reloc_pc13(tcg_insn_unit *src_rw, const tcg_insn_unit *target) | ||
44 | return false; | 16 | return false; |
45 | } | 17 | } |
46 | 18 | ||
47 | +static bool reloc_pc11(tcg_insn_unit *src_rw, const tcg_insn_unit *target) | 19 | +/* If the binary operation has both arguments equal, fold to @i. */ |
20 | +static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
48 | +{ | 21 | +{ |
49 | + const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw); | 22 | + if (args_are_copies(op->args[1], op->args[2])) { |
50 | + ptrdiff_t offset = (tcg_ptr_byte_diff(target, src_rx) - 8) / 4; | 23 | + return tcg_opt_gen_movi(ctx, op, op->args[0], i); |
51 | + | ||
52 | + if (offset >= -0xff && offset <= 0xff) { | ||
53 | + tcg_insn_unit insn = *src_rw; | ||
54 | + bool u = (offset >= 0); | ||
55 | + if (!u) { | ||
56 | + offset = -offset; | ||
57 | + } | ||
58 | + insn = deposit32(insn, 23, 1, u); | ||
59 | + insn = deposit32(insn, 0, 8, offset); | ||
60 | + *src_rw = insn; | ||
61 | + return true; | ||
62 | + } | 24 | + } |
63 | + return false; | 25 | + return false; |
64 | +} | 26 | +} |
65 | + | 27 | + |
66 | +static bool reloc_pc8(tcg_insn_unit *src_rw, const tcg_insn_unit *target) | 28 | /* |
67 | +{ | 29 | * These outermost fold_<op> functions are sorted alphabetically. |
68 | + const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw); | 30 | */ |
69 | + ptrdiff_t offset = tcg_ptr_byte_diff(target, src_rx) - 8; | 31 | @@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op) |
70 | + int rot = encode_imm(offset); | 32 | |
71 | + | 33 | static bool fold_andc(OptContext *ctx, TCGOp *op) |
72 | + if (rot >= 0) { | 34 | { |
73 | + *src_rw = deposit32(*src_rw, 0, 12, rol32(offset, rot) | (rot << 7)); | 35 | - return fold_const2(ctx, op); |
36 | + if (fold_const2(ctx, op) || | ||
37 | + fold_xx_to_i(ctx, op, 0)) { | ||
74 | + return true; | 38 | + return true; |
75 | + } | 39 | + } |
76 | + return false; | 40 | + return false; |
77 | +} | 41 | } |
78 | + | 42 | |
79 | static bool patch_reloc(tcg_insn_unit *code_ptr, int type, | 43 | static bool fold_brcond(OptContext *ctx, TCGOp *op) |
80 | intptr_t value, intptr_t addend) | 44 | @@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op) |
45 | |||
46 | static bool fold_sub(OptContext *ctx, TCGOp *op) | ||
81 | { | 47 | { |
82 | tcg_debug_assert(addend == 0); | 48 | - return fold_const2(ctx, op); |
83 | - | 49 | + if (fold_const2(ctx, op) || |
84 | - if (type == R_ARM_PC24) { | 50 | + fold_xx_to_i(ctx, op, 0)) { |
85 | + switch (type) { | ||
86 | + case R_ARM_PC24: | ||
87 | return reloc_pc24(code_ptr, (const tcg_insn_unit *)value); | ||
88 | - } else if (type == R_ARM_PC13) { | ||
89 | + case R_ARM_PC13: | ||
90 | return reloc_pc13(code_ptr, (const tcg_insn_unit *)value); | ||
91 | - } else { | ||
92 | + case R_ARM_PC11: | ||
93 | + return reloc_pc11(code_ptr, (const tcg_insn_unit *)value); | ||
94 | + case R_ARM_PC8: | ||
95 | + return reloc_pc8(code_ptr, (const tcg_insn_unit *)value); | ||
96 | + default: | ||
97 | g_assert_not_reached(); | ||
98 | } | ||
99 | } | ||
100 | @@ -XXX,XX +XXX,XX @@ static inline uint32_t rotl(uint32_t val, int n) | ||
101 | |||
102 | /* ARM immediates for ALU instructions are made of an unsigned 8-bit | ||
103 | right-rotated by an even amount between 0 and 30. */ | ||
104 | -static inline int encode_imm(uint32_t imm) | ||
105 | +static int encode_imm(uint32_t imm) | ||
106 | { | ||
107 | int shift; | ||
108 | |||
109 | @@ -XXX,XX +XXX,XX @@ static inline int check_fit_imm(uint32_t imm) | ||
110 | return encode_imm(imm) >= 0; | ||
111 | } | ||
112 | |||
113 | +/* Return true if v16 is a valid 16-bit shifted immediate. */ | ||
114 | +static bool is_shimm16(uint16_t v16, int *cmode, int *imm8) | ||
115 | +{ | ||
116 | + if (v16 == (v16 & 0xff)) { | ||
117 | + *cmode = 0x8; | ||
118 | + *imm8 = v16 & 0xff; | ||
119 | + return true; | ||
120 | + } else if (v16 == (v16 & 0xff00)) { | ||
121 | + *cmode = 0xa; | ||
122 | + *imm8 = v16 >> 8; | ||
123 | + return true; | 51 | + return true; |
124 | + } | 52 | + } |
125 | + return false; | 53 | + return false; |
126 | +} | 54 | } |
127 | + | 55 | |
128 | +/* Return true if v32 is a valid 32-bit shifted immediate. */ | 56 | static bool fold_sub2_i32(OptContext *ctx, TCGOp *op) |
129 | +static bool is_shimm32(uint32_t v32, int *cmode, int *imm8) | 57 | @@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op) |
130 | +{ | 58 | |
131 | + if (v32 == (v32 & 0xff)) { | 59 | static bool fold_xor(OptContext *ctx, TCGOp *op) |
132 | + *cmode = 0x0; | 60 | { |
133 | + *imm8 = v32 & 0xff; | 61 | - return fold_const2(ctx, op); |
134 | + return true; | 62 | + if (fold_const2(ctx, op) || |
135 | + } else if (v32 == (v32 & 0xff00)) { | 63 | + fold_xx_to_i(ctx, op, 0)) { |
136 | + *cmode = 0x2; | ||
137 | + *imm8 = (v32 >> 8) & 0xff; | ||
138 | + return true; | ||
139 | + } else if (v32 == (v32 & 0xff0000)) { | ||
140 | + *cmode = 0x4; | ||
141 | + *imm8 = (v32 >> 16) & 0xff; | ||
142 | + return true; | ||
143 | + } else if (v32 == (v32 & 0xff000000)) { | ||
144 | + *cmode = 0x6; | ||
145 | + *imm8 = v32 >> 24; | ||
146 | + return true; | 64 | + return true; |
147 | + } | 65 | + } |
148 | + return false; | 66 | + return false; |
149 | +} | ||
150 | + | ||
151 | +/* Return true if v32 is a valid 32-bit shifting ones immediate. */ | ||
152 | +static bool is_soimm32(uint32_t v32, int *cmode, int *imm8) | ||
153 | +{ | ||
154 | + if ((v32 & 0xffff00ff) == 0xff) { | ||
155 | + *cmode = 0xc; | ||
156 | + *imm8 = (v32 >> 8) & 0xff; | ||
157 | + return true; | ||
158 | + } else if ((v32 & 0xff00ffff) == 0xffff) { | ||
159 | + *cmode = 0xd; | ||
160 | + *imm8 = (v32 >> 16) & 0xff; | ||
161 | + return true; | ||
162 | + } | ||
163 | + return false; | ||
164 | +} | ||
165 | + | ||
166 | +/* | ||
167 | + * Return non-zero if v32 can be formed by MOVI+ORR. | ||
168 | + * Place the parameters for MOVI in (cmode, imm8). | ||
169 | + * Return the cmode for ORR; the imm8 can be had via extraction from v32. | ||
170 | + */ | ||
171 | +static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8) | ||
172 | +{ | ||
173 | + int i; | ||
174 | + | ||
175 | + for (i = 6; i > 0; i -= 2) { | ||
176 | + /* Mask out one byte we can add with ORR. */ | ||
177 | + uint32_t tmp = v32 & ~(0xffu << (i * 4)); | ||
178 | + if (is_shimm32(tmp, cmode, imm8) || | ||
179 | + is_soimm32(tmp, cmode, imm8)) { | ||
180 | + break; | ||
181 | + } | ||
182 | + } | ||
183 | + return i; | ||
184 | +} | ||
185 | + | ||
186 | /* Test if a constant matches the constraint. | ||
187 | * TODO: define constraints for: | ||
188 | * | ||
189 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece, | ||
190 | encode_vd(d) | encode_vn(n) | encode_vm(m)); | ||
191 | } | 67 | } |
192 | 68 | ||
193 | +static void tcg_out_vmovi(TCGContext *s, TCGReg rd, | 69 | /* Propagate constants and copies, fold constant expressions. */ |
194 | + int q, int op, int cmode, uint8_t imm8) | 70 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) |
195 | +{ | 71 | break; |
196 | + tcg_out32(s, INSN_VMOVI | encode_vd(rd) | (q << 6) | (op << 5) | 72 | } |
197 | + | (cmode << 8) | extract32(imm8, 0, 4) | 73 | |
198 | + | (extract32(imm8, 4, 3) << 16) | 74 | - /* Simplify expression for "op r, a, a => movi r, 0" cases */ |
199 | + | (extract32(imm8, 7, 1) << 24)); | 75 | - switch (opc) { |
200 | +} | 76 | - CASE_OP_32_64_VEC(andc): |
201 | + | 77 | - CASE_OP_32_64_VEC(sub): |
202 | static void tcg_out_vldst(TCGContext *s, ARMInsn insn, | 78 | - CASE_OP_32_64_VEC(xor): |
203 | TCGReg rd, TCGReg rn, int offset) | 79 | - if (args_are_copies(op->args[1], op->args[2])) { |
204 | { | 80 | - tcg_opt_gen_movi(&ctx, op, op->args[0], 0); |
205 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, | 81 | - continue; |
206 | tcg_out_movi32(s, COND_AL, ret, arg); | 82 | - } |
207 | } | 83 | - break; |
208 | 84 | - default: | |
209 | +/* Type is always V128, with I64 elements. */ | 85 | - break; |
210 | +static void tcg_out_dup2_vec(TCGContext *s, TCGReg rd, TCGReg rl, TCGReg rh) | 86 | - } |
211 | +{ | 87 | - |
212 | + /* Move high element into place first. */ | 88 | /* |
213 | + /* VMOV Dd+1, Ds */ | 89 | * Process each opcode. |
214 | + tcg_out_vreg3(s, INSN_VORR | (1 << 12), 0, 0, rd, rh, rh); | 90 | * Sorted alphabetically by opcode as much as possible. |
215 | + /* Move low element into place; tcg_out_mov will check for nop. */ | ||
216 | + tcg_out_mov(s, TCG_TYPE_V64, rd, rl); | ||
217 | +} | ||
218 | + | ||
219 | static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, | ||
220 | TCGReg rd, TCGReg rs) | ||
221 | { | ||
222 | - g_assert_not_reached(); | ||
223 | + int q = type - TCG_TYPE_V64; | ||
224 | + | ||
225 | + if (vece == MO_64) { | ||
226 | + if (type == TCG_TYPE_V128) { | ||
227 | + tcg_out_dup2_vec(s, rd, rs, rs); | ||
228 | + } else { | ||
229 | + tcg_out_mov(s, TCG_TYPE_V64, rd, rs); | ||
230 | + } | ||
231 | + } else if (rs < TCG_REG_Q0) { | ||
232 | + int b = (vece == MO_8); | ||
233 | + int e = (vece == MO_16); | ||
234 | + tcg_out32(s, INSN_VDUP_G | (b << 22) | (q << 21) | (e << 5) | | ||
235 | + encode_vn(rd) | (rs << 12)); | ||
236 | + } else { | ||
237 | + int imm4 = 1 << vece; | ||
238 | + tcg_out32(s, INSN_VDUP_S | (imm4 << 16) | (q << 6) | | ||
239 | + encode_vd(rd) | encode_vm(rs)); | ||
240 | + } | ||
241 | + return true; | ||
242 | } | ||
243 | |||
244 | static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, | ||
245 | TCGReg rd, TCGReg base, intptr_t offset) | ||
246 | { | ||
247 | - g_assert_not_reached(); | ||
248 | + if (vece == MO_64) { | ||
249 | + tcg_out_ld(s, TCG_TYPE_V64, rd, base, offset); | ||
250 | + if (type == TCG_TYPE_V128) { | ||
251 | + tcg_out_dup2_vec(s, rd, rd, rd); | ||
252 | + } | ||
253 | + } else { | ||
254 | + int q = type - TCG_TYPE_V64; | ||
255 | + tcg_out_vldst(s, INSN_VLD1R | (vece << 6) | (q << 5), | ||
256 | + rd, base, offset); | ||
257 | + } | ||
258 | + return true; | ||
259 | } | ||
260 | |||
261 | static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, | ||
262 | TCGReg rd, int64_t v64) | ||
263 | { | ||
264 | - g_assert_not_reached(); | ||
265 | + int q = type - TCG_TYPE_V64; | ||
266 | + int cmode, imm8, i; | ||
267 | + | ||
268 | + /* Test all bytes equal first. */ | ||
269 | + if (vece == MO_8) { | ||
270 | + tcg_out_vmovi(s, rd, q, 0, 0xe, v64); | ||
271 | + return; | ||
272 | + } | ||
273 | + | ||
274 | + /* | ||
275 | + * Test all bytes 0x00 or 0xff second. This can match cases that | ||
276 | + * might otherwise take 2 or 3 insns for MO_16 or MO_32 below. | ||
277 | + */ | ||
278 | + for (i = imm8 = 0; i < 8; i++) { | ||
279 | + uint8_t byte = v64 >> (i * 8); | ||
280 | + if (byte == 0xff) { | ||
281 | + imm8 |= 1 << i; | ||
282 | + } else if (byte != 0) { | ||
283 | + goto fail_bytes; | ||
284 | + } | ||
285 | + } | ||
286 | + tcg_out_vmovi(s, rd, q, 1, 0xe, imm8); | ||
287 | + return; | ||
288 | + fail_bytes: | ||
289 | + | ||
290 | + /* | ||
291 | + * Tests for various replications. For each element width, if we | ||
292 | + * cannot find an expansion there's no point checking a larger | ||
293 | + * width because we already know by replication it cannot match. | ||
294 | + */ | ||
295 | + if (vece == MO_16) { | ||
296 | + uint16_t v16 = v64; | ||
297 | + | ||
298 | + if (is_shimm16(v16, &cmode, &imm8)) { | ||
299 | + tcg_out_vmovi(s, rd, q, 0, cmode, imm8); | ||
300 | + return; | ||
301 | + } | ||
302 | + if (is_shimm16(~v16, &cmode, &imm8)) { | ||
303 | + tcg_out_vmovi(s, rd, q, 1, cmode, imm8); | ||
304 | + return; | ||
305 | + } | ||
306 | + | ||
307 | + /* | ||
308 | + * Otherwise, all remaining constants can be loaded in two insns: | ||
309 | + * rd = v16 & 0xff, rd |= v16 & 0xff00. | ||
310 | + */ | ||
311 | + tcg_out_vmovi(s, rd, q, 0, 0x8, v16 & 0xff); | ||
312 | + tcg_out_vmovi(s, rd, q, 0, 0xb, v16 >> 8); /* VORRI */ | ||
313 | + return; | ||
314 | + } | ||
315 | + | ||
316 | + if (vece == MO_32) { | ||
317 | + uint32_t v32 = v64; | ||
318 | + | ||
319 | + if (is_shimm32(v32, &cmode, &imm8) || | ||
320 | + is_soimm32(v32, &cmode, &imm8)) { | ||
321 | + tcg_out_vmovi(s, rd, q, 0, cmode, imm8); | ||
322 | + return; | ||
323 | + } | ||
324 | + if (is_shimm32(~v32, &cmode, &imm8) || | ||
325 | + is_soimm32(~v32, &cmode, &imm8)) { | ||
326 | + tcg_out_vmovi(s, rd, q, 1, cmode, imm8); | ||
327 | + return; | ||
328 | + } | ||
329 | + | ||
330 | + /* | ||
331 | + * Restrict the set of constants to those we can load with | ||
332 | + * two instructions. Others we load from the pool. | ||
333 | + */ | ||
334 | + i = is_shimm32_pair(v32, &cmode, &imm8); | ||
335 | + if (i) { | ||
336 | + tcg_out_vmovi(s, rd, q, 0, cmode, imm8); | ||
337 | + tcg_out_vmovi(s, rd, q, 0, i | 1, extract32(v32, i * 4, 8)); | ||
338 | + return; | ||
339 | + } | ||
340 | + i = is_shimm32_pair(~v32, &cmode, &imm8); | ||
341 | + if (i) { | ||
342 | + tcg_out_vmovi(s, rd, q, 1, cmode, imm8); | ||
343 | + tcg_out_vmovi(s, rd, q, 1, i | 1, extract32(~v32, i * 4, 8)); | ||
344 | + return; | ||
345 | + } | ||
346 | + } | ||
347 | + | ||
348 | + /* | ||
349 | + * As a last resort, load from the constant pool. | ||
350 | + */ | ||
351 | + if (!q || vece == MO_64) { | ||
352 | + new_pool_l2(s, R_ARM_PC11, s->code_ptr, 0, v64, v64 >> 32); | ||
353 | + /* VLDR Dd, [pc + offset] */ | ||
354 | + tcg_out32(s, INSN_VLDR_D | encode_vd(rd) | (0xf << 16)); | ||
355 | + if (q) { | ||
356 | + tcg_out_dup2_vec(s, rd, rd, rd); | ||
357 | + } | ||
358 | + } else { | ||
359 | + new_pool_label(s, (uint32_t)v64, R_ARM_PC8, s->code_ptr, 0); | ||
360 | + /* add tmp, pc, offset */ | ||
361 | + tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_TMP, TCG_REG_PC, 0); | ||
362 | + tcg_out_dupm_vec(s, type, MO_32, rd, TCG_REG_TMP, 0); | ||
363 | + } | ||
364 | } | ||
365 | |||
366 | static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
367 | -- | 91 | -- |
368 | 2.25.1 | 92 | 2.25.1 |
369 | 93 | ||
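As an aside on the immediate tests introduced above: the shifted-immediate classification can be exercised outside of QEMU. Below is a minimal standalone sketch of is_shimm32(), assuming nothing beyond the C standard library; the loop is a compressed form of the patch's if/else ladder, and the test values are arbitrary.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* One non-zero byte, byte-aligned: cmode 0x0/0x2/0x4/0x6 as in the patch. */
    static bool is_shimm32(uint32_t v32, int *cmode, int *imm8)
    {
        for (int i = 0; i < 4; i++) {
            if (v32 == (v32 & (0xffu << (i * 8)))) {
                *cmode = i * 2;
                *imm8 = (v32 >> (i * 8)) & 0xff;
                return true;
            }
        }
        return false;
    }

    int main(void)
    {
        uint32_t tests[] = { 0x45, 0x4500, 0xab0000, 0x12345678 };
        for (int i = 0; i < 4; i++) {
            int cmode, imm8;
            if (is_shimm32(tests[i], &cmode, &imm8)) {
                printf("%#010x -> cmode=%#x imm8=%#x\n", tests[i], cmode, imm8);
            } else {
                printf("%#010x -> needs a multi-insn or pool load\n", tests[i]);
            }
        }
        return 0;
    }

Running it shows 0x12345678 falling through, which is exactly the case the constant-pool path at the end of tcg_out_dupi_vec() handles.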
New patch | |||
---|---|---|---|
1 | Pull the "op r, a, a => mov r, a" optimization into a function, | ||
2 | and use it in the outer opcode fold functions. | ||
1 | 3 | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 39 ++++++++++++++++++++++++--------------- | ||
9 | 1 file changed, 24 insertions(+), 15 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
16 | return false; | ||
17 | } | ||
18 | |||
19 | +/* If the binary operation has both arguments equal, fold to identity. */ | ||
20 | +static bool fold_xx_to_x(OptContext *ctx, TCGOp *op) | ||
21 | +{ | ||
22 | + if (args_are_copies(op->args[1], op->args[2])) { | ||
23 | + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]); | ||
24 | + } | ||
25 | + return false; | ||
26 | +} | ||
27 | + | ||
28 | /* | ||
29 | * These outermost fold_<op> functions are sorted alphabetically. | ||
30 | + * | ||
31 | + * The ordering of the transformations should be: | ||
32 | + * 1) those that produce a constant | ||
33 | + * 2) those that produce a copy | ||
34 | + * 3) those that produce information about the result value. | ||
35 | */ | ||
36 | |||
37 | static bool fold_add(OptContext *ctx, TCGOp *op) | ||
38 | @@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op) | ||
39 | |||
40 | static bool fold_and(OptContext *ctx, TCGOp *op) | ||
41 | { | ||
42 | - return fold_const2(ctx, op); | ||
43 | + if (fold_const2(ctx, op) || | ||
44 | + fold_xx_to_x(ctx, op)) { | ||
45 | + return true; | ||
46 | + } | ||
47 | + return false; | ||
48 | } | ||
49 | |||
50 | static bool fold_andc(OptContext *ctx, TCGOp *op) | ||
51 | @@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op) | ||
52 | |||
53 | static bool fold_or(OptContext *ctx, TCGOp *op) | ||
54 | { | ||
55 | - return fold_const2(ctx, op); | ||
56 | + if (fold_const2(ctx, op) || | ||
57 | + fold_xx_to_x(ctx, op)) { | ||
58 | + return true; | ||
59 | + } | ||
60 | + return false; | ||
61 | } | ||
62 | |||
63 | static bool fold_orc(OptContext *ctx, TCGOp *op) | ||
64 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
65 | break; | ||
66 | } | ||
67 | |||
68 | - /* Simplify expression for "op r, a, a => mov r, a" cases */ | ||
69 | - switch (opc) { | ||
70 | - CASE_OP_32_64_VEC(or): | ||
71 | - CASE_OP_32_64_VEC(and): | ||
72 | - if (args_are_copies(op->args[1], op->args[2])) { | ||
73 | - tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
74 | - continue; | ||
75 | - } | ||
76 | - break; | ||
77 | - default: | ||
78 | - break; | ||
79 | - } | ||
80 | - | ||
81 | /* | ||
82 | * Process each opcode. | ||
83 | * Sorted alphabetically by opcode as much as possible. | ||
84 | -- | ||
85 | 2.25.1 | ||
86 | |||
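Both this patch's fold_xx_to_x() and the previous patch's fold_xx_to_i() encode one-line algebraic identities. A throwaway check (not QEMU code) that they hold for arbitrary 64-bit values:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t samples[] = { 0, 1, 0xdeadbeefcafef00dull, UINT64_MAX };
        for (int i = 0; i < 4; i++) {
            uint64_t a = samples[i];
            /* fold_xx_to_i: the result is a constant. */
            assert((a - a) == 0);      /* sub  r, a, a => movi r, 0 */
            assert((a ^ a) == 0);      /* xor  r, a, a => movi r, 0 */
            assert((a & ~a) == 0);     /* andc r, a, a => movi r, 0 */
            /* fold_xx_to_x: the result is a copy of the input. */
            assert((a & a) == a);      /* and  r, a, a => mov r, a */
            assert((a | a) == a);      /* or   r, a, a => mov r, a */
        }
        return 0;
    }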
New patch | |||
---|---|---|---|
1 | Pull the "op r, a, 0 => movi r, 0" optimization into a function, | ||
2 | and use it in the outer opcode fold functions. | ||
1 | 3 | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 38 ++++++++++++++++++++------------------ | ||
9 | 1 file changed, 20 insertions(+), 18 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op) | ||
16 | return false; | ||
17 | } | ||
18 | |||
19 | +/* If the binary operation has second argument @i, fold to @i. */ | ||
20 | +static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
21 | +{ | ||
22 | + if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) { | ||
23 | + return tcg_opt_gen_movi(ctx, op, op->args[0], i); | ||
24 | + } | ||
25 | + return false; | ||
26 | +} | ||
27 | + | ||
28 | /* If the binary operation has both arguments equal, fold to @i. */ | ||
29 | static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
30 | { | ||
31 | @@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op) | ||
32 | static bool fold_and(OptContext *ctx, TCGOp *op) | ||
33 | { | ||
34 | if (fold_const2(ctx, op) || | ||
35 | + fold_xi_to_i(ctx, op, 0) || | ||
36 | fold_xx_to_x(ctx, op)) { | ||
37 | return true; | ||
38 | } | ||
39 | @@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op) | ||
40 | |||
41 | static bool fold_mul(OptContext *ctx, TCGOp *op) | ||
42 | { | ||
43 | - return fold_const2(ctx, op); | ||
44 | + if (fold_const2(ctx, op) || | ||
45 | + fold_xi_to_i(ctx, op, 0)) { | ||
46 | + return true; | ||
47 | + } | ||
48 | + return false; | ||
49 | } | ||
50 | |||
51 | static bool fold_mul_highpart(OptContext *ctx, TCGOp *op) | ||
52 | { | ||
53 | - return fold_const2(ctx, op); | ||
54 | + if (fold_const2(ctx, op) || | ||
55 | + fold_xi_to_i(ctx, op, 0)) { | ||
56 | + return true; | ||
57 | + } | ||
58 | + return false; | ||
59 | } | ||
60 | |||
61 | static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op) | ||
62 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
63 | continue; | ||
64 | } | ||
65 | |||
66 | - /* Simplify expression for "op r, a, 0 => movi r, 0" cases */ | ||
67 | - switch (opc) { | ||
68 | - CASE_OP_32_64_VEC(and): | ||
69 | - CASE_OP_32_64_VEC(mul): | ||
70 | - CASE_OP_32_64(muluh): | ||
71 | - CASE_OP_32_64(mulsh): | ||
72 | - if (arg_is_const(op->args[2]) | ||
73 | - && arg_info(op->args[2])->val == 0) { | ||
74 | - tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
75 | - continue; | ||
76 | - } | ||
77 | - break; | ||
78 | - default: | ||
79 | - break; | ||
80 | - } | ||
81 | - | ||
82 | /* | ||
83 | * Process each opcode. | ||
84 | * Sorted alphabetically by opcode as much as possible. | ||
85 | -- | ||
86 | 2.25.1 | ||
87 | |||
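fold_xi_to_i() captures the absorbing element 0 of and/mul/muluh/mulsh. A quick standalone check; the __int128 arithmetic used to model muluh here is a GCC/Clang extension and is not part of the patch:

    #include <assert.h>
    #include <stdint.h>

    /* High 64 bits of a 64x64->128 unsigned multiply, as muluh computes. */
    static uint64_t muluh64(uint64_t a, uint64_t b)
    {
        return (uint64_t)(((unsigned __int128)a * b) >> 64);
    }

    int main(void)
    {
        uint64_t a = 0x123456789abcdef0ull;
        assert((a & 0) == 0);          /* and   r, a, 0 => movi r, 0 */
        assert((a * 0) == 0);          /* mul   r, a, 0 => movi r, 0 */
        assert(muluh64(a, 0) == 0);    /* muluh r, a, 0 => movi r, 0 */
        return 0;
    }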
New patch | |||
---|---|---|---|
1 | 1 | Compute the type of the operation early. | |
2 | |||
3 | There are at least 4 places that used a def->flags ladder | ||
4 | to determine the type of the operation being optimized. | ||
5 | |||
6 | There were two places that assumed !TCG_OPF_64BIT means | ||
7 | TCG_TYPE_I32, and so could potentially compute incorrect | ||
8 | results for vector operations. | ||
9 | |||
10 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
11 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
12 | --- | ||
13 | tcg/optimize.c | 149 +++++++++++++++++++++++++++++-------------------- | ||
14 | 1 file changed, 89 insertions(+), 60 deletions(-) | ||
15 | |||
16 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/tcg/optimize.c | ||
19 | +++ b/tcg/optimize.c | ||
20 | @@ -XXX,XX +XXX,XX @@ typedef struct OptContext { | ||
21 | |||
22 | /* In flight values from optimization. */ | ||
23 | uint64_t z_mask; | ||
24 | + TCGType type; | ||
25 | } OptContext; | ||
26 | |||
27 | static inline TempOptInfo *ts_info(TCGTemp *ts) | ||
28 | @@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
29 | { | ||
30 | TCGTemp *dst_ts = arg_temp(dst); | ||
31 | TCGTemp *src_ts = arg_temp(src); | ||
32 | - const TCGOpDef *def; | ||
33 | TempOptInfo *di; | ||
34 | TempOptInfo *si; | ||
35 | uint64_t z_mask; | ||
36 | @@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
37 | reset_ts(dst_ts); | ||
38 | di = ts_info(dst_ts); | ||
39 | si = ts_info(src_ts); | ||
40 | - def = &tcg_op_defs[op->opc]; | ||
41 | - if (def->flags & TCG_OPF_VECTOR) { | ||
42 | - new_op = INDEX_op_mov_vec; | ||
43 | - } else if (def->flags & TCG_OPF_64BIT) { | ||
44 | - new_op = INDEX_op_mov_i64; | ||
45 | - } else { | ||
46 | + | ||
47 | + switch (ctx->type) { | ||
48 | + case TCG_TYPE_I32: | ||
49 | new_op = INDEX_op_mov_i32; | ||
50 | + break; | ||
51 | + case TCG_TYPE_I64: | ||
52 | + new_op = INDEX_op_mov_i64; | ||
53 | + break; | ||
54 | + case TCG_TYPE_V64: | ||
55 | + case TCG_TYPE_V128: | ||
56 | + case TCG_TYPE_V256: | ||
57 | + /* TCGOP_VECL and TCGOP_VECE remain unchanged. */ | ||
58 | + new_op = INDEX_op_mov_vec; | ||
59 | + break; | ||
60 | + default: | ||
61 | + g_assert_not_reached(); | ||
62 | } | ||
63 | op->opc = new_op; | ||
64 | - /* TCGOP_VECL and TCGOP_VECE remain unchanged. */ | ||
65 | op->args[0] = dst; | ||
66 | op->args[1] = src; | ||
67 | |||
68 | @@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
69 | static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op, | ||
70 | TCGArg dst, uint64_t val) | ||
71 | { | ||
72 | - const TCGOpDef *def = &tcg_op_defs[op->opc]; | ||
73 | - TCGType type; | ||
74 | - TCGTemp *tv; | ||
75 | - | ||
76 | - if (def->flags & TCG_OPF_VECTOR) { | ||
77 | - type = TCGOP_VECL(op) + TCG_TYPE_V64; | ||
78 | - } else if (def->flags & TCG_OPF_64BIT) { | ||
79 | - type = TCG_TYPE_I64; | ||
80 | - } else { | ||
81 | - type = TCG_TYPE_I32; | ||
82 | - } | ||
83 | - | ||
84 | /* Convert movi to mov with constant temp. */ | ||
85 | - tv = tcg_constant_internal(type, val); | ||
86 | + TCGTemp *tv = tcg_constant_internal(ctx->type, val); | ||
87 | + | ||
88 | init_ts_info(ctx, tv); | ||
89 | return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv)); | ||
90 | } | ||
91 | @@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y) | ||
92 | } | ||
93 | } | ||
94 | |||
95 | -static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y) | ||
96 | +static uint64_t do_constant_folding(TCGOpcode op, TCGType type, | ||
97 | + uint64_t x, uint64_t y) | ||
98 | { | ||
99 | - const TCGOpDef *def = &tcg_op_defs[op]; | ||
100 | uint64_t res = do_constant_folding_2(op, x, y); | ||
101 | - if (!(def->flags & TCG_OPF_64BIT)) { | ||
102 | + if (type == TCG_TYPE_I32) { | ||
103 | res = (int32_t)res; | ||
104 | } | ||
105 | return res; | ||
106 | @@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c) | ||
107 | * Return -1 if the condition can't be simplified, | ||
108 | * and the result of the condition (0 or 1) if it can. | ||
109 | */ | ||
110 | -static int do_constant_folding_cond(TCGOpcode op, TCGArg x, | ||
111 | +static int do_constant_folding_cond(TCGType type, TCGArg x, | ||
112 | TCGArg y, TCGCond c) | ||
113 | { | ||
114 | uint64_t xv = arg_info(x)->val; | ||
115 | uint64_t yv = arg_info(y)->val; | ||
116 | |||
117 | if (arg_is_const(x) && arg_is_const(y)) { | ||
118 | - const TCGOpDef *def = &tcg_op_defs[op]; | ||
119 | - tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR)); | ||
120 | - if (def->flags & TCG_OPF_64BIT) { | ||
121 | - return do_constant_folding_cond_64(xv, yv, c); | ||
122 | - } else { | ||
123 | + switch (type) { | ||
124 | + case TCG_TYPE_I32: | ||
125 | return do_constant_folding_cond_32(xv, yv, c); | ||
126 | + case TCG_TYPE_I64: | ||
127 | + return do_constant_folding_cond_64(xv, yv, c); | ||
128 | + default: | ||
129 | + /* Only scalar comparisons are optimizable */ | ||
130 | + return -1; | ||
131 | } | ||
132 | } else if (args_are_copies(x, y)) { | ||
133 | return do_constant_folding_cond_eq(c); | ||
134 | @@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op) | ||
135 | uint64_t t; | ||
136 | |||
137 | t = arg_info(op->args[1])->val; | ||
138 | - t = do_constant_folding(op->opc, t, 0); | ||
139 | + t = do_constant_folding(op->opc, ctx->type, t, 0); | ||
140 | return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
141 | } | ||
142 | return false; | ||
143 | @@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op) | ||
144 | uint64_t t1 = arg_info(op->args[1])->val; | ||
145 | uint64_t t2 = arg_info(op->args[2])->val; | ||
146 | |||
147 | - t1 = do_constant_folding(op->opc, t1, t2); | ||
148 | + t1 = do_constant_folding(op->opc, ctx->type, t1, t2); | ||
149 | return tcg_opt_gen_movi(ctx, op, op->args[0], t1); | ||
150 | } | ||
151 | return false; | ||
152 | @@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op) | ||
153 | static bool fold_brcond(OptContext *ctx, TCGOp *op) | ||
154 | { | ||
155 | TCGCond cond = op->args[2]; | ||
156 | - int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond); | ||
157 | + int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond); | ||
158 | |||
159 | if (i == 0) { | ||
160 | tcg_op_remove(ctx->tcg, op); | ||
161 | @@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op) | ||
162 | * Simplify EQ/NE comparisons where one of the pairs | ||
163 | * can be simplified. | ||
164 | */ | ||
165 | - i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0], | ||
166 | + i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0], | ||
167 | op->args[2], cond); | ||
168 | switch (i ^ inv) { | ||
169 | case 0: | ||
170 | @@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op) | ||
171 | goto do_brcond_high; | ||
172 | } | ||
173 | |||
174 | - i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1], | ||
175 | + i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1], | ||
176 | op->args[3], cond); | ||
177 | switch (i ^ inv) { | ||
178 | case 0: | ||
179 | @@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op) | ||
180 | if (arg_is_const(op->args[1])) { | ||
181 | uint64_t t = arg_info(op->args[1])->val; | ||
182 | |||
183 | - t = do_constant_folding(op->opc, t, op->args[2]); | ||
184 | + t = do_constant_folding(op->opc, ctx->type, t, op->args[2]); | ||
185 | return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
186 | } | ||
187 | return false; | ||
188 | @@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op) | ||
189 | uint64_t t = arg_info(op->args[1])->val; | ||
190 | |||
191 | if (t != 0) { | ||
192 | - t = do_constant_folding(op->opc, t, 0); | ||
193 | + t = do_constant_folding(op->opc, ctx->type, t, 0); | ||
194 | return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
195 | } | ||
196 | return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]); | ||
197 | @@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op) | ||
198 | |||
199 | static bool fold_movcond(OptContext *ctx, TCGOp *op) | ||
200 | { | ||
201 | - TCGOpcode opc = op->opc; | ||
202 | TCGCond cond = op->args[5]; | ||
203 | - int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond); | ||
204 | + int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond); | ||
205 | |||
206 | if (i >= 0) { | ||
207 | return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]); | ||
208 | @@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op) | ||
209 | if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { | ||
210 | uint64_t tv = arg_info(op->args[3])->val; | ||
211 | uint64_t fv = arg_info(op->args[4])->val; | ||
212 | + TCGOpcode opc; | ||
213 | |||
214 | - opc = (opc == INDEX_op_movcond_i32 | ||
215 | - ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64); | ||
216 | + switch (ctx->type) { | ||
217 | + case TCG_TYPE_I32: | ||
218 | + opc = INDEX_op_setcond_i32; | ||
219 | + break; | ||
220 | + case TCG_TYPE_I64: | ||
221 | + opc = INDEX_op_setcond_i64; | ||
222 | + break; | ||
223 | + default: | ||
224 | + g_assert_not_reached(); | ||
225 | + } | ||
226 | |||
227 | if (tv == 1 && fv == 0) { | ||
228 | op->opc = opc; | ||
229 | @@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op) | ||
230 | static bool fold_setcond(OptContext *ctx, TCGOp *op) | ||
231 | { | ||
232 | TCGCond cond = op->args[3]; | ||
233 | - int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond); | ||
234 | + int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond); | ||
235 | |||
236 | if (i >= 0) { | ||
237 | return tcg_opt_gen_movi(ctx, op, op->args[0], i); | ||
238 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op) | ||
239 | * Simplify EQ/NE comparisons where one of the pairs | ||
240 | * can be simplified. | ||
241 | */ | ||
242 | - i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1], | ||
243 | + i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1], | ||
244 | op->args[3], cond); | ||
245 | switch (i ^ inv) { | ||
246 | case 0: | ||
247 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op) | ||
248 | goto do_setcond_high; | ||
249 | } | ||
250 | |||
251 | - i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2], | ||
252 | + i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2], | ||
253 | op->args[4], cond); | ||
254 | switch (i ^ inv) { | ||
255 | case 0: | ||
256 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
257 | init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs); | ||
258 | copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs); | ||
259 | |||
260 | + /* Pre-compute the type of the operation. */ | ||
261 | + if (def->flags & TCG_OPF_VECTOR) { | ||
262 | + ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op); | ||
263 | + } else if (def->flags & TCG_OPF_64BIT) { | ||
264 | + ctx.type = TCG_TYPE_I64; | ||
265 | + } else { | ||
266 | + ctx.type = TCG_TYPE_I32; | ||
267 | + } | ||
268 | + | ||
269 | /* For commutative operations make constant second argument */ | ||
270 | switch (opc) { | ||
271 | CASE_OP_32_64_VEC(add): | ||
272 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
273 | /* Proceed with possible constant folding. */ | ||
274 | break; | ||
275 | } | ||
276 | - if (opc == INDEX_op_sub_i32) { | ||
277 | + switch (ctx.type) { | ||
278 | + case TCG_TYPE_I32: | ||
279 | neg_op = INDEX_op_neg_i32; | ||
280 | have_neg = TCG_TARGET_HAS_neg_i32; | ||
281 | - } else if (opc == INDEX_op_sub_i64) { | ||
282 | + break; | ||
283 | + case TCG_TYPE_I64: | ||
284 | neg_op = INDEX_op_neg_i64; | ||
285 | have_neg = TCG_TARGET_HAS_neg_i64; | ||
286 | - } else if (TCG_TARGET_HAS_neg_vec) { | ||
287 | - TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64; | ||
288 | - unsigned vece = TCGOP_VECE(op); | ||
289 | - neg_op = INDEX_op_neg_vec; | ||
290 | - have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0; | ||
291 | - } else { | ||
292 | break; | ||
293 | + case TCG_TYPE_V64: | ||
294 | + case TCG_TYPE_V128: | ||
295 | + case TCG_TYPE_V256: | ||
296 | + neg_op = INDEX_op_neg_vec; | ||
297 | + have_neg = tcg_can_emit_vec_op(neg_op, ctx.type, | ||
298 | + TCGOP_VECE(op)) > 0; | ||
299 | + break; | ||
300 | + default: | ||
301 | + g_assert_not_reached(); | ||
302 | } | ||
303 | if (!have_neg) { | ||
304 | break; | ||
305 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
306 | TCGOpcode not_op; | ||
307 | bool have_not; | ||
308 | |||
309 | - if (def->flags & TCG_OPF_VECTOR) { | ||
310 | - not_op = INDEX_op_not_vec; | ||
311 | - have_not = TCG_TARGET_HAS_not_vec; | ||
312 | - } else if (def->flags & TCG_OPF_64BIT) { | ||
313 | - not_op = INDEX_op_not_i64; | ||
314 | - have_not = TCG_TARGET_HAS_not_i64; | ||
315 | - } else { | ||
316 | + switch (ctx.type) { | ||
317 | + case TCG_TYPE_I32: | ||
318 | not_op = INDEX_op_not_i32; | ||
319 | have_not = TCG_TARGET_HAS_not_i32; | ||
320 | + break; | ||
321 | + case TCG_TYPE_I64: | ||
322 | + not_op = INDEX_op_not_i64; | ||
323 | + have_not = TCG_TARGET_HAS_not_i64; | ||
324 | + break; | ||
325 | + case TCG_TYPE_V64: | ||
326 | + case TCG_TYPE_V128: | ||
327 | + case TCG_TYPE_V256: | ||
328 | + not_op = INDEX_op_not_vec; | ||
329 | + have_not = TCG_TARGET_HAS_not_vec; | ||
330 | + break; | ||
331 | + default: | ||
332 | + g_assert_not_reached(); | ||
333 | } | ||
334 | if (!have_not) { | ||
335 | break; | ||
336 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
337 | below, we can ignore high bits, but for further optimizations we | ||
338 | need to record that the high bits contain garbage. */ | ||
339 | partmask = z_mask; | ||
340 | - if (!(def->flags & TCG_OPF_64BIT)) { | ||
341 | + if (ctx.type == TCG_TYPE_I32) { | ||
342 | z_mask |= ~(tcg_target_ulong)0xffffffffu; | ||
343 | partmask &= 0xffffffffu; | ||
344 | affected &= 0xffffffffu; | ||
345 | -- | ||
346 | 2.25.1 | ||
347 | |||
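One consequence worth spelling out: do_constant_folding() now truncates by ctx->type rather than by opcode flags. A standalone sketch (hypothetical helper, not the QEMU function) of why that truncation matters when an i32 fold is computed in 64-bit arithmetic:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t fold_add(int is_i32, uint64_t x, uint64_t y)
    {
        uint64_t res = x + y;       /* as in do_constant_folding_2() */
        if (is_i32) {
            res = (int32_t)res;     /* drop the carry, keep sign extension */
        }
        return res;
    }

    int main(void)
    {
        /* For TCG_TYPE_I32, 0xffffffff + 1 wraps to 0... */
        assert(fold_add(1, 0xffffffffull, 1) == 0);
        /* ...while the same fold at TCG_TYPE_I64 keeps the carry. */
        assert(fold_add(0, 0xffffffffull, 1) == 0x100000000ull);
        return 0;
    }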
New patch | |||
---|---|---|---|
1 | 1 | Split out the conditional conversion from a more complex logical | |
2 | operation to a simple NOT. Create a couple more helpers to make | ||
3 | this easy for the outer-most logical operations. | ||
4 | |||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 158 +++++++++++++++++++++++++++---------------------- | ||
9 | 1 file changed, 86 insertions(+), 72 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op) | ||
16 | return false; | ||
17 | } | ||
18 | |||
19 | +/* | ||
20 | + * Convert @op to NOT, if NOT is supported by the host. | ||
21 | + * Return true if the conversion is successful, which will still | ||
22 | + * indicate that the processing is complete. | ||
23 | + */ | ||
24 | +static bool fold_not(OptContext *ctx, TCGOp *op); | ||
25 | +static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx) | ||
26 | +{ | ||
27 | + TCGOpcode not_op; | ||
28 | + bool have_not; | ||
29 | + | ||
30 | + switch (ctx->type) { | ||
31 | + case TCG_TYPE_I32: | ||
32 | + not_op = INDEX_op_not_i32; | ||
33 | + have_not = TCG_TARGET_HAS_not_i32; | ||
34 | + break; | ||
35 | + case TCG_TYPE_I64: | ||
36 | + not_op = INDEX_op_not_i64; | ||
37 | + have_not = TCG_TARGET_HAS_not_i64; | ||
38 | + break; | ||
39 | + case TCG_TYPE_V64: | ||
40 | + case TCG_TYPE_V128: | ||
41 | + case TCG_TYPE_V256: | ||
42 | + not_op = INDEX_op_not_vec; | ||
43 | + have_not = TCG_TARGET_HAS_not_vec; | ||
44 | + break; | ||
45 | + default: | ||
46 | + g_assert_not_reached(); | ||
47 | + } | ||
48 | + if (have_not) { | ||
49 | + op->opc = not_op; | ||
50 | + op->args[1] = op->args[idx]; | ||
51 | + return fold_not(ctx, op); | ||
52 | + } | ||
53 | + return false; | ||
54 | +} | ||
55 | + | ||
56 | +/* If the binary operation has first argument @i, fold to NOT. */ | ||
57 | +static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i) | ||
58 | +{ | ||
59 | + if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) { | ||
60 | + return fold_to_not(ctx, op, 2); | ||
61 | + } | ||
62 | + return false; | ||
63 | +} | ||
64 | + | ||
65 | /* If the binary operation has second argument @i, fold to @i. */ | ||
66 | static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
67 | { | ||
68 | @@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
69 | return false; | ||
70 | } | ||
71 | |||
72 | +/* If the binary operation has second argument @i, fold to NOT. */ | ||
73 | +static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i) | ||
74 | +{ | ||
75 | + if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) { | ||
76 | + return fold_to_not(ctx, op, 1); | ||
77 | + } | ||
78 | + return false; | ||
79 | +} | ||
80 | + | ||
81 | /* If the binary operation has both arguments equal, fold to @i. */ | ||
82 | static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
83 | { | ||
84 | @@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op) | ||
85 | static bool fold_andc(OptContext *ctx, TCGOp *op) | ||
86 | { | ||
87 | if (fold_const2(ctx, op) || | ||
88 | - fold_xx_to_i(ctx, op, 0)) { | ||
89 | + fold_xx_to_i(ctx, op, 0) || | ||
90 | + fold_ix_to_not(ctx, op, -1)) { | ||
91 | return true; | ||
92 | } | ||
93 | return false; | ||
94 | @@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op) | ||
95 | |||
96 | static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
97 | { | ||
98 | - return fold_const2(ctx, op); | ||
99 | + if (fold_const2(ctx, op) || | ||
100 | + fold_xi_to_not(ctx, op, 0)) { | ||
101 | + return true; | ||
102 | + } | ||
103 | + return false; | ||
104 | } | ||
105 | |||
106 | static bool fold_extract(OptContext *ctx, TCGOp *op) | ||
107 | @@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op) | ||
108 | |||
109 | static bool fold_nand(OptContext *ctx, TCGOp *op) | ||
110 | { | ||
111 | - return fold_const2(ctx, op); | ||
112 | + if (fold_const2(ctx, op) || | ||
113 | + fold_xi_to_not(ctx, op, -1)) { | ||
114 | + return true; | ||
115 | + } | ||
116 | + return false; | ||
117 | } | ||
118 | |||
119 | static bool fold_neg(OptContext *ctx, TCGOp *op) | ||
120 | @@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op) | ||
121 | |||
122 | static bool fold_nor(OptContext *ctx, TCGOp *op) | ||
123 | { | ||
124 | - return fold_const2(ctx, op); | ||
125 | + if (fold_const2(ctx, op) || | ||
126 | + fold_xi_to_not(ctx, op, 0)) { | ||
127 | + return true; | ||
128 | + } | ||
129 | + return false; | ||
130 | } | ||
131 | |||
132 | static bool fold_not(OptContext *ctx, TCGOp *op) | ||
133 | { | ||
134 | - return fold_const1(ctx, op); | ||
135 | + if (fold_const1(ctx, op)) { | ||
136 | + return true; | ||
137 | + } | ||
138 | + | ||
139 | + /* Because of fold_to_not, we want to always return true, via finish_folding. */ | ||
140 | + finish_folding(ctx, op); | ||
141 | + return true; | ||
142 | } | ||
143 | |||
144 | static bool fold_or(OptContext *ctx, TCGOp *op) | ||
145 | @@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op) | ||
146 | |||
147 | static bool fold_orc(OptContext *ctx, TCGOp *op) | ||
148 | { | ||
149 | - return fold_const2(ctx, op); | ||
150 | + if (fold_const2(ctx, op) || | ||
151 | + fold_ix_to_not(ctx, op, 0)) { | ||
152 | + return true; | ||
153 | + } | ||
154 | + return false; | ||
155 | } | ||
156 | |||
157 | static bool fold_qemu_ld(OptContext *ctx, TCGOp *op) | ||
158 | @@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op) | ||
159 | static bool fold_xor(OptContext *ctx, TCGOp *op) | ||
160 | { | ||
161 | if (fold_const2(ctx, op) || | ||
162 | - fold_xx_to_i(ctx, op, 0)) { | ||
163 | + fold_xx_to_i(ctx, op, 0) || | ||
164 | + fold_xi_to_not(ctx, op, -1)) { | ||
165 | return true; | ||
166 | } | ||
167 | return false; | ||
168 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
169 | } | ||
170 | } | ||
171 | break; | ||
172 | - CASE_OP_32_64_VEC(xor): | ||
173 | - CASE_OP_32_64(nand): | ||
174 | - if (!arg_is_const(op->args[1]) | ||
175 | - && arg_is_const(op->args[2]) | ||
176 | - && arg_info(op->args[2])->val == -1) { | ||
177 | - i = 1; | ||
178 | - goto try_not; | ||
179 | - } | ||
180 | - break; | ||
181 | - CASE_OP_32_64(nor): | ||
182 | - if (!arg_is_const(op->args[1]) | ||
183 | - && arg_is_const(op->args[2]) | ||
184 | - && arg_info(op->args[2])->val == 0) { | ||
185 | - i = 1; | ||
186 | - goto try_not; | ||
187 | - } | ||
188 | - break; | ||
189 | - CASE_OP_32_64_VEC(andc): | ||
190 | - if (!arg_is_const(op->args[2]) | ||
191 | - && arg_is_const(op->args[1]) | ||
192 | - && arg_info(op->args[1])->val == -1) { | ||
193 | - i = 2; | ||
194 | - goto try_not; | ||
195 | - } | ||
196 | - break; | ||
197 | - CASE_OP_32_64_VEC(orc): | ||
198 | - CASE_OP_32_64(eqv): | ||
199 | - if (!arg_is_const(op->args[2]) | ||
200 | - && arg_is_const(op->args[1]) | ||
201 | - && arg_info(op->args[1])->val == 0) { | ||
202 | - i = 2; | ||
203 | - goto try_not; | ||
204 | - } | ||
205 | - break; | ||
206 | - try_not: | ||
207 | - { | ||
208 | - TCGOpcode not_op; | ||
209 | - bool have_not; | ||
210 | - | ||
211 | - switch (ctx.type) { | ||
212 | - case TCG_TYPE_I32: | ||
213 | - not_op = INDEX_op_not_i32; | ||
214 | - have_not = TCG_TARGET_HAS_not_i32; | ||
215 | - break; | ||
216 | - case TCG_TYPE_I64: | ||
217 | - not_op = INDEX_op_not_i64; | ||
218 | - have_not = TCG_TARGET_HAS_not_i64; | ||
219 | - break; | ||
220 | - case TCG_TYPE_V64: | ||
221 | - case TCG_TYPE_V128: | ||
222 | - case TCG_TYPE_V256: | ||
223 | - not_op = INDEX_op_not_vec; | ||
224 | - have_not = TCG_TARGET_HAS_not_vec; | ||
225 | - break; | ||
226 | - default: | ||
227 | - g_assert_not_reached(); | ||
228 | - } | ||
229 | - if (!have_not) { | ||
230 | - break; | ||
231 | - } | ||
232 | - op->opc = not_op; | ||
233 | - reset_temp(op->args[0]); | ||
234 | - op->args[1] = op->args[i]; | ||
235 | - continue; | ||
236 | - } | ||
237 | default: | ||
238 | break; | ||
239 | } | ||
240 | -- | ||
241 | 2.25.1 | ||
242 | |||
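Each rewrite added here corresponds to a bitwise identity over the second (xi) or first (ix) operand. A throwaway check (not QEMU code), one assertion per conversion in the patch:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t a = 0x0123456789abcdefull;
        assert((a ^ -1ull) == ~a);     /* xor  r, a, -1 => not r, a */
        assert(~(a & -1ull) == ~a);    /* nand r, a, -1 => not r, a */
        assert(~(a | 0) == ~a);        /* nor  r, a, 0  => not r, a */
        assert(~(a ^ 0) == ~a);        /* eqv  r, a, 0  => not r, a */
        assert((-1ull & ~a) == ~a);    /* andc r, -1, a => not r, a */
        assert((0 | ~a) == ~a);        /* orc  r, 0, a  => not r, a */
        return 0;
    }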
New patch | |||
---|---|---|---|
1 | Even though there is only one user, place this more complex | ||
2 | conversion into its own helper. | ||
1 | 3 | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
6 | --- | ||
7 | tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------ | ||
8 | 1 file changed, 47 insertions(+), 42 deletions(-) | ||
9 | |||
10 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
11 | index XXXXXXX..XXXXXXX 100644 | ||
12 | --- a/tcg/optimize.c | ||
13 | +++ b/tcg/optimize.c | ||
14 | @@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op) | ||
15 | |||
16 | static bool fold_neg(OptContext *ctx, TCGOp *op) | ||
17 | { | ||
18 | - return fold_const1(ctx, op); | ||
19 | + if (fold_const1(ctx, op)) { | ||
20 | + return true; | ||
21 | + } | ||
22 | + /* | ||
23 | + * Because of fold_sub_to_neg, we want to always return true, | ||
24 | + * via finish_folding. | ||
25 | + */ | ||
26 | + finish_folding(ctx, op); | ||
27 | + return true; | ||
28 | } | ||
29 | |||
30 | static bool fold_nor(OptContext *ctx, TCGOp *op) | ||
31 | @@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op) | ||
32 | return fold_const2(ctx, op); | ||
33 | } | ||
34 | |||
35 | +static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op) | ||
36 | +{ | ||
37 | + TCGOpcode neg_op; | ||
38 | + bool have_neg; | ||
39 | + | ||
40 | + if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) { | ||
41 | + return false; | ||
42 | + } | ||
43 | + | ||
44 | + switch (ctx->type) { | ||
45 | + case TCG_TYPE_I32: | ||
46 | + neg_op = INDEX_op_neg_i32; | ||
47 | + have_neg = TCG_TARGET_HAS_neg_i32; | ||
48 | + break; | ||
49 | + case TCG_TYPE_I64: | ||
50 | + neg_op = INDEX_op_neg_i64; | ||
51 | + have_neg = TCG_TARGET_HAS_neg_i64; | ||
52 | + break; | ||
53 | + case TCG_TYPE_V64: | ||
54 | + case TCG_TYPE_V128: | ||
55 | + case TCG_TYPE_V256: | ||
56 | + neg_op = INDEX_op_neg_vec; | ||
57 | + have_neg = (TCG_TARGET_HAS_neg_vec && | ||
58 | + tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0); | ||
59 | + break; | ||
60 | + default: | ||
61 | + g_assert_not_reached(); | ||
62 | + } | ||
63 | + if (have_neg) { | ||
64 | + op->opc = neg_op; | ||
65 | + op->args[1] = op->args[2]; | ||
66 | + return fold_neg(ctx, op); | ||
67 | + } | ||
68 | + return false; | ||
69 | +} | ||
70 | + | ||
71 | static bool fold_sub(OptContext *ctx, TCGOp *op) | ||
72 | { | ||
73 | if (fold_const2(ctx, op) || | ||
74 | - fold_xx_to_i(ctx, op, 0)) { | ||
75 | + fold_xx_to_i(ctx, op, 0) || | ||
76 | + fold_sub_to_neg(ctx, op)) { | ||
77 | return true; | ||
78 | } | ||
79 | return false; | ||
80 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
81 | continue; | ||
82 | } | ||
83 | break; | ||
84 | - CASE_OP_32_64_VEC(sub): | ||
85 | - { | ||
86 | - TCGOpcode neg_op; | ||
87 | - bool have_neg; | ||
88 | - | ||
89 | - if (arg_is_const(op->args[2])) { | ||
90 | - /* Proceed with possible constant folding. */ | ||
91 | - break; | ||
92 | - } | ||
93 | - switch (ctx.type) { | ||
94 | - case TCG_TYPE_I32: | ||
95 | - neg_op = INDEX_op_neg_i32; | ||
96 | - have_neg = TCG_TARGET_HAS_neg_i32; | ||
97 | - break; | ||
98 | - case TCG_TYPE_I64: | ||
99 | - neg_op = INDEX_op_neg_i64; | ||
100 | - have_neg = TCG_TARGET_HAS_neg_i64; | ||
101 | - break; | ||
102 | - case TCG_TYPE_V64: | ||
103 | - case TCG_TYPE_V128: | ||
104 | - case TCG_TYPE_V256: | ||
105 | - neg_op = INDEX_op_neg_vec; | ||
106 | - have_neg = tcg_can_emit_vec_op(neg_op, ctx.type, | ||
107 | - TCGOP_VECE(op)) > 0; | ||
108 | - break; | ||
109 | - default: | ||
110 | - g_assert_not_reached(); | ||
111 | - } | ||
112 | - if (!have_neg) { | ||
113 | - break; | ||
114 | - } | ||
115 | - if (arg_is_const(op->args[1]) | ||
116 | - && arg_info(op->args[1])->val == 0) { | ||
117 | - op->opc = neg_op; | ||
118 | - reset_temp(op->args[0]); | ||
119 | - op->args[1] = op->args[2]; | ||
120 | - continue; | ||
121 | - } | ||
122 | - } | ||
123 | - break; | ||
124 | default: | ||
125 | break; | ||
126 | } | ||
127 | -- | ||
128 | 2.25.1 | ||
129 | |||
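A toy model of the control flow this patch creates (names hypothetical, not QEMU's API): fold_sub() tries constants, then x - x, and only then the "sub r, 0, b => neg r, b" rewrite, which retargets the op in place when the host has a neg opcode:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum toy_opc { TOY_SUB, TOY_NEG };

    struct toy_op {
        enum toy_opc opc;
        int64_t args[3];    /* args[0] = dest; constants encoded directly */
    };

    /* Toy fold_sub_to_neg(): rewrite "sub d, 0, b" into "neg d, b". */
    static bool toy_sub_to_neg(struct toy_op *op, bool host_has_neg)
    {
        if (op->args[1] != 0 || !host_has_neg) {
            return false;
        }
        op->opc = TOY_NEG;
        op->args[1] = op->args[2];
        return true;
    }

    int main(void)
    {
        struct toy_op op = { TOY_SUB, { 0, 0, 42 } };
        if (toy_sub_to_neg(&op, true)) {
            printf("rewritten to neg, operand %lld\n", (long long)op.args[1]);
        }
        return 0;
    }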
New patch | |||
---|---|---|---|
1 | Pull the "op r, a, i => mov r, a" optimization into a function, | ||
2 | and use it in the outer-most logical operations. | ||
1 | 3 | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
6 | --- | ||
7 | tcg/optimize.c | 61 +++++++++++++++++++++----------------------------- | ||
8 | 1 file changed, 26 insertions(+), 35 deletions(-) | ||
9 | |||
10 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
11 | index XXXXXXX..XXXXXXX 100644 | ||
12 | --- a/tcg/optimize.c | ||
13 | +++ b/tcg/optimize.c | ||
14 | @@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
15 | return false; | ||
16 | } | ||
17 | |||
18 | +/* If the binary operation has second argument @i, fold to identity. */ | ||
19 | +static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i) | ||
20 | +{ | ||
21 | + if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) { | ||
22 | + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]); | ||
23 | + } | ||
24 | + return false; | ||
25 | +} | ||
26 | + | ||
27 | /* If the binary operation has second argument @i, fold to NOT. */ | ||
28 | static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i) | ||
29 | { | ||
30 | @@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op) | ||
31 | |||
32 | static bool fold_add(OptContext *ctx, TCGOp *op) | ||
33 | { | ||
34 | - return fold_const2(ctx, op); | ||
35 | + if (fold_const2(ctx, op) || | ||
36 | + fold_xi_to_x(ctx, op, 0)) { | ||
37 | + return true; | ||
38 | + } | ||
39 | + return false; | ||
40 | } | ||
41 | |||
42 | static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add) | ||
43 | @@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op) | ||
44 | { | ||
45 | if (fold_const2(ctx, op) || | ||
46 | fold_xi_to_i(ctx, op, 0) || | ||
47 | + fold_xi_to_x(ctx, op, -1) || | ||
48 | fold_xx_to_x(ctx, op)) { | ||
49 | return true; | ||
50 | } | ||
51 | @@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op) | ||
52 | { | ||
53 | if (fold_const2(ctx, op) || | ||
54 | fold_xx_to_i(ctx, op, 0) || | ||
55 | + fold_xi_to_x(ctx, op, 0) || | ||
56 | fold_ix_to_not(ctx, op, -1)) { | ||
57 | return true; | ||
58 | } | ||
59 | @@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op) | ||
60 | static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
61 | { | ||
62 | if (fold_const2(ctx, op) || | ||
63 | + fold_xi_to_x(ctx, op, -1) || | ||
64 | fold_xi_to_not(ctx, op, 0)) { | ||
65 | return true; | ||
66 | } | ||
67 | @@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op) | ||
68 | static bool fold_or(OptContext *ctx, TCGOp *op) | ||
69 | { | ||
70 | if (fold_const2(ctx, op) || | ||
71 | + fold_xi_to_x(ctx, op, 0) || | ||
72 | fold_xx_to_x(ctx, op)) { | ||
73 | return true; | ||
74 | } | ||
75 | @@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op) | ||
76 | static bool fold_orc(OptContext *ctx, TCGOp *op) | ||
77 | { | ||
78 | if (fold_const2(ctx, op) || | ||
79 | + fold_xi_to_x(ctx, op, -1) || | ||
80 | fold_ix_to_not(ctx, op, 0)) { | ||
81 | return true; | ||
82 | } | ||
83 | @@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op) | ||
84 | |||
85 | static bool fold_shift(OptContext *ctx, TCGOp *op) | ||
86 | { | ||
87 | - return fold_const2(ctx, op); | ||
88 | + if (fold_const2(ctx, op) || | ||
89 | + fold_xi_to_x(ctx, op, 0)) { | ||
90 | + return true; | ||
91 | + } | ||
92 | + return false; | ||
93 | } | ||
94 | |||
95 | static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op) | ||
96 | @@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op) | ||
97 | { | ||
98 | if (fold_const2(ctx, op) || | ||
99 | fold_xx_to_i(ctx, op, 0) || | ||
100 | + fold_xi_to_x(ctx, op, 0) || | ||
101 | fold_sub_to_neg(ctx, op)) { | ||
102 | return true; | ||
103 | } | ||
104 | @@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op) | ||
105 | { | ||
106 | if (fold_const2(ctx, op) || | ||
107 | fold_xx_to_i(ctx, op, 0) || | ||
108 | + fold_xi_to_x(ctx, op, 0) || | ||
109 | fold_xi_to_not(ctx, op, -1)) { | ||
110 | return true; | ||
111 | } | ||
112 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
113 | break; | ||
114 | } | ||
115 | |||
116 | - /* Simplify expression for "op r, a, const => mov r, a" cases */ | ||
117 | - switch (opc) { | ||
118 | - CASE_OP_32_64_VEC(add): | ||
119 | - CASE_OP_32_64_VEC(sub): | ||
120 | - CASE_OP_32_64_VEC(or): | ||
121 | - CASE_OP_32_64_VEC(xor): | ||
122 | - CASE_OP_32_64_VEC(andc): | ||
123 | - CASE_OP_32_64(shl): | ||
124 | - CASE_OP_32_64(shr): | ||
125 | - CASE_OP_32_64(sar): | ||
126 | - CASE_OP_32_64(rotl): | ||
127 | - CASE_OP_32_64(rotr): | ||
128 | - if (!arg_is_const(op->args[1]) | ||
129 | - && arg_is_const(op->args[2]) | ||
130 | - && arg_info(op->args[2])->val == 0) { | ||
131 | - tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
132 | - continue; | ||
133 | - } | ||
134 | - break; | ||
135 | - CASE_OP_32_64_VEC(and): | ||
136 | - CASE_OP_32_64_VEC(orc): | ||
137 | - CASE_OP_32_64(eqv): | ||
138 | - if (!arg_is_const(op->args[1]) | ||
139 | - && arg_is_const(op->args[2]) | ||
140 | - && arg_info(op->args[2])->val == -1) { | ||
141 | - tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
142 | - continue; | ||
143 | - } | ||
144 | - break; | ||
145 | - default: | ||
146 | - break; | ||
147 | - } | ||
148 | - | ||
149 | /* Simplify using known-zero bits. Currently only ops with a single | ||
150 | output argument is supported. */ | ||
151 | z_mask = -1; | ||
152 | -- | ||
153 | 2.25.1 | ||
154 | |||
155 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Pull the "op r, 0, b => movi r, 0" optimization into a function, | ||
2 | and use it in fold_shift. | ||
1 | 3 | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 28 ++++++++++------------------ | ||
9 | 1 file changed, 10 insertions(+), 18 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx) | ||
16 | return false; | ||
17 | } | ||
18 | |||
19 | +/* If the binary operation has first argument @i, fold to @i. */ | ||
20 | +static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i) | ||
21 | +{ | ||
22 | + if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) { | ||
23 | + return tcg_opt_gen_movi(ctx, op, op->args[0], i); | ||
24 | + } | ||
25 | + return false; | ||
26 | +} | ||
27 | + | ||
28 | /* If the binary operation has first argument @i, fold to NOT. */ | ||
29 | static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i) | ||
30 | { | ||
31 | @@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op) | ||
32 | static bool fold_shift(OptContext *ctx, TCGOp *op) | ||
33 | { | ||
34 | if (fold_const2(ctx, op) || | ||
35 | + fold_ix_to_i(ctx, op, 0) || | ||
36 | fold_xi_to_x(ctx, op, 0)) { | ||
37 | return true; | ||
38 | } | ||
39 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
40 | break; | ||
41 | } | ||
42 | |||
43 | - /* Simplify expressions for "shift/rot r, 0, a => movi r, 0", | ||
44 | - and "sub r, 0, a => neg r, a" case. */ | ||
45 | - switch (opc) { | ||
46 | - CASE_OP_32_64(shl): | ||
47 | - CASE_OP_32_64(shr): | ||
48 | - CASE_OP_32_64(sar): | ||
49 | - CASE_OP_32_64(rotl): | ||
50 | - CASE_OP_32_64(rotr): | ||
51 | - if (arg_is_const(op->args[1]) | ||
52 | - && arg_info(op->args[1])->val == 0) { | ||
53 | - tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
54 | - continue; | ||
55 | - } | ||
56 | - break; | ||
57 | - default: | ||
58 | - break; | ||
59 | - } | ||
60 | - | ||
61 | /* Simplify using known-zero bits. Currently only ops with a single | ||
62 | output argument is supported. */ | ||
63 | z_mask = -1; | ||
64 | -- | ||
65 | 2.25.1 | ||
66 | |||
67 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Move all of the known-zero optimizations into the per-opcode | ||
2 | functions. Use fold_masks when there is a possibility of the | ||
3 | result being determined, and simply set ctx->z_mask otherwise. | ||
1 | 4 | ||
5 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
6 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
8 | --- | ||
9 | tcg/optimize.c | 545 ++++++++++++++++++++++++++----------------------- | ||
10 | 1 file changed, 294 insertions(+), 251 deletions(-) | ||
11 | |||
12 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/tcg/optimize.c | ||
15 | +++ b/tcg/optimize.c | ||
16 | @@ -XXX,XX +XXX,XX @@ typedef struct OptContext { | ||
17 | TCGTempSet temps_used; | ||
18 | |||
19 | /* In flight values from optimization. */ | ||
20 | - uint64_t z_mask; | ||
21 | + uint64_t a_mask; /* mask bit is 0 iff value identical to first input */ | ||
22 | + uint64_t z_mask; /* mask bit is 0 iff value bit is 0 */ | ||
23 | TCGType type; | ||
24 | } OptContext; | ||
25 | |||
26 | @@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op) | ||
27 | return false; | ||
28 | } | ||
29 | |||
30 | +static bool fold_masks(OptContext *ctx, TCGOp *op) | ||
31 | +{ | ||
32 | + uint64_t a_mask = ctx->a_mask; | ||
33 | + uint64_t z_mask = ctx->z_mask; | ||
34 | + | ||
35 | + /* | ||
36 | + * 32-bit ops generate 32-bit results. For the result is zero test | ||
37 | + * below, we can ignore high bits, but for further optimizations we | ||
38 | + * need to record that the high bits contain garbage. | ||
39 | + */ | ||
40 | + if (ctx->type == TCG_TYPE_I32) { | ||
41 | + ctx->z_mask |= MAKE_64BIT_MASK(32, 32); | ||
42 | + a_mask &= MAKE_64BIT_MASK(0, 32); | ||
43 | + z_mask &= MAKE_64BIT_MASK(0, 32); | ||
44 | + } | ||
45 | + | ||
46 | + if (z_mask == 0) { | ||
47 | + return tcg_opt_gen_movi(ctx, op, op->args[0], 0); | ||
48 | + } | ||
49 | + if (a_mask == 0) { | ||
50 | + return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]); | ||
51 | + } | ||
52 | + return false; | ||
53 | +} | ||
54 | + | ||
55 | /* | ||
56 | * Convert @op to NOT, if NOT is supported by the host. | ||
57 | * Return true if the conversion is successful, which will still | ||
58 | @@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op) | ||
59 | |||
60 | static bool fold_and(OptContext *ctx, TCGOp *op) | ||
61 | { | ||
62 | + uint64_t z1, z2; | ||
63 | + | ||
64 | if (fold_const2(ctx, op) || | ||
65 | fold_xi_to_i(ctx, op, 0) || | ||
66 | fold_xi_to_x(ctx, op, -1) || | ||
67 | fold_xx_to_x(ctx, op)) { | ||
68 | return true; | ||
69 | } | ||
70 | - return false; | ||
71 | + | ||
72 | + z1 = arg_info(op->args[1])->z_mask; | ||
73 | + z2 = arg_info(op->args[2])->z_mask; | ||
74 | + ctx->z_mask = z1 & z2; | ||
75 | + | ||
76 | + /* | ||
77 | + * Known-zeros do not imply known-ones. Therefore unless | ||
78 | + * arg2 is constant, we can't infer affected bits from it. | ||
79 | + */ | ||
80 | + if (arg_is_const(op->args[2])) { | ||
81 | + ctx->a_mask = z1 & ~z2; | ||
82 | + } | ||
83 | + | ||
84 | + return fold_masks(ctx, op); | ||
85 | } | ||
86 | |||
87 | static bool fold_andc(OptContext *ctx, TCGOp *op) | ||
88 | { | ||
89 | + uint64_t z1; | ||
90 | + | ||
91 | if (fold_const2(ctx, op) || | ||
92 | fold_xx_to_i(ctx, op, 0) || | ||
93 | fold_xi_to_x(ctx, op, 0) || | ||
94 | fold_ix_to_not(ctx, op, -1)) { | ||
95 | return true; | ||
96 | } | ||
97 | - return false; | ||
98 | + | ||
99 | + z1 = arg_info(op->args[1])->z_mask; | ||
100 | + | ||
101 | + /* | ||
102 | + * Known-zeros do not imply known-ones. Therefore unless | ||
103 | + * arg2 is constant, we can't infer anything from it. | ||
104 | + */ | ||
105 | + if (arg_is_const(op->args[2])) { | ||
106 | + uint64_t z2 = ~arg_info(op->args[2])->z_mask; | ||
107 | + ctx->a_mask = z1 & ~z2; | ||
108 | + z1 &= z2; | ||
109 | + } | ||
110 | + ctx->z_mask = z1; | ||
111 | + | ||
112 | + return fold_masks(ctx, op); | ||
113 | } | ||
114 | |||
115 | static bool fold_brcond(OptContext *ctx, TCGOp *op) | ||
116 | @@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op) | ||
117 | |||
118 | static bool fold_bswap(OptContext *ctx, TCGOp *op) | ||
119 | { | ||
120 | + uint64_t z_mask, sign; | ||
121 | + | ||
122 | if (arg_is_const(op->args[1])) { | ||
123 | uint64_t t = arg_info(op->args[1])->val; | ||
124 | |||
125 | t = do_constant_folding(op->opc, ctx->type, t, op->args[2]); | ||
126 | return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
127 | } | ||
128 | - return false; | ||
129 | + | ||
130 | + z_mask = arg_info(op->args[1])->z_mask; | ||
131 | + switch (op->opc) { | ||
132 | + case INDEX_op_bswap16_i32: | ||
133 | + case INDEX_op_bswap16_i64: | ||
134 | + z_mask = bswap16(z_mask); | ||
135 | + sign = INT16_MIN; | ||
136 | + break; | ||
137 | + case INDEX_op_bswap32_i32: | ||
138 | + case INDEX_op_bswap32_i64: | ||
139 | + z_mask = bswap32(z_mask); | ||
140 | + sign = INT32_MIN; | ||
141 | + break; | ||
142 | + case INDEX_op_bswap64_i64: | ||
143 | + z_mask = bswap64(z_mask); | ||
144 | + sign = INT64_MIN; | ||
145 | + break; | ||
146 | + default: | ||
147 | + g_assert_not_reached(); | ||
148 | + } | ||
149 | + | ||
150 | + switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) { | ||
151 | + case TCG_BSWAP_OZ: | ||
152 | + break; | ||
153 | + case TCG_BSWAP_OS: | ||
154 | + /* If the sign bit may be 1, force all the bits above to 1. */ | ||
155 | + if (z_mask & sign) { | ||
156 | + z_mask |= sign; | ||
157 | + } | ||
158 | + break; | ||
159 | + default: | ||
160 | + /* The high bits are undefined: force all bits above the sign to 1. */ | ||
161 | + z_mask |= sign << 1; | ||
162 | + break; | ||
163 | + } | ||
164 | + ctx->z_mask = z_mask; | ||
165 | + | ||
166 | + return fold_masks(ctx, op); | ||
167 | } | ||
168 | |||
169 | static bool fold_call(OptContext *ctx, TCGOp *op) | ||
170 | @@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op) | ||
171 | |||
172 | static bool fold_count_zeros(OptContext *ctx, TCGOp *op) | ||
173 | { | ||
174 | + uint64_t z_mask; | ||
175 | + | ||
176 | if (arg_is_const(op->args[1])) { | ||
177 | uint64_t t = arg_info(op->args[1])->val; | ||
178 | |||
179 | @@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op) | ||
180 | } | ||
181 | return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]); | ||
182 | } | ||
183 | + | ||
184 | + switch (ctx->type) { | ||
185 | + case TCG_TYPE_I32: | ||
186 | + z_mask = 31; | ||
187 | + break; | ||
188 | + case TCG_TYPE_I64: | ||
189 | + z_mask = 63; | ||
190 | + break; | ||
191 | + default: | ||
192 | + g_assert_not_reached(); | ||
193 | + } | ||
194 | + ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask; | ||
195 | + | ||
196 | return false; | ||
197 | } | ||
198 | |||
199 | static bool fold_ctpop(OptContext *ctx, TCGOp *op) | ||
200 | { | ||
201 | - return fold_const1(ctx, op); | ||
202 | + if (fold_const1(ctx, op)) { | ||
203 | + return true; | ||
204 | + } | ||
205 | + | ||
206 | + switch (ctx->type) { | ||
207 | + case TCG_TYPE_I32: | ||
208 | + ctx->z_mask = 32 | 31; | ||
209 | + break; | ||
210 | + case TCG_TYPE_I64: | ||
211 | + ctx->z_mask = 64 | 63; | ||
212 | + break; | ||
213 | + default: | ||
214 | + g_assert_not_reached(); | ||
215 | + } | ||
216 | + return false; | ||
217 | } | ||
218 | |||
219 | static bool fold_deposit(OptContext *ctx, TCGOp *op) | ||
220 | @@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op) | ||
221 | t1 = deposit64(t1, op->args[3], op->args[4], t2); | ||
222 | return tcg_opt_gen_movi(ctx, op, op->args[0], t1); | ||
223 | } | ||
224 | + | ||
225 | + ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask, | ||
226 | + op->args[3], op->args[4], | ||
227 | + arg_info(op->args[2])->z_mask); | ||
228 | return false; | ||
229 | } | ||
230 | |||
231 | @@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
232 | |||
233 | static bool fold_extract(OptContext *ctx, TCGOp *op) | ||
234 | { | ||
235 | + uint64_t z_mask_old, z_mask; | ||
236 | + | ||
237 | if (arg_is_const(op->args[1])) { | ||
238 | uint64_t t; | ||
239 | |||
240 | @@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op) | ||
241 | t = extract64(t, op->args[2], op->args[3]); | ||
242 | return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
243 | } | ||
244 | - return false; | ||
245 | + | ||
246 | + z_mask_old = arg_info(op->args[1])->z_mask; | ||
247 | + z_mask = extract64(z_mask_old, op->args[2], op->args[3]); | ||
248 | + if (op->args[2] == 0) { | ||
249 | + ctx->a_mask = z_mask_old ^ z_mask; | ||
250 | + } | ||
251 | + ctx->z_mask = z_mask; | ||
252 | + | ||
253 | + return fold_masks(ctx, op); | ||
254 | } | ||
255 | |||
256 | static bool fold_extract2(OptContext *ctx, TCGOp *op) | ||
257 | @@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op) | ||
258 | |||
259 | static bool fold_exts(OptContext *ctx, TCGOp *op) | ||
260 | { | ||
261 | - return fold_const1(ctx, op); | ||
262 | + uint64_t z_mask_old, z_mask, sign; | ||
263 | + bool type_change = false; | ||
264 | + | ||
265 | + if (fold_const1(ctx, op)) { | ||
266 | + return true; | ||
267 | + } | ||
268 | + | ||
269 | + z_mask_old = z_mask = arg_info(op->args[1])->z_mask; | ||
270 | + | ||
271 | + switch (op->opc) { | ||
272 | + CASE_OP_32_64(ext8s): | ||
273 | + sign = INT8_MIN; | ||
274 | + z_mask = (uint8_t)z_mask; | ||
275 | + break; | ||
276 | + CASE_OP_32_64(ext16s): | ||
277 | + sign = INT16_MIN; | ||
278 | + z_mask = (uint16_t)z_mask; | ||
279 | + break; | ||
280 | + case INDEX_op_ext_i32_i64: | ||
281 | + type_change = true; | ||
282 | + QEMU_FALLTHROUGH; | ||
283 | + case INDEX_op_ext32s_i64: | ||
284 | + sign = INT32_MIN; | ||
285 | + z_mask = (uint32_t)z_mask; | ||
286 | + break; | ||
287 | + default: | ||
288 | + g_assert_not_reached(); | ||
289 | + } | ||
290 | + | ||
291 | + if (z_mask & sign) { | ||
292 | + z_mask |= sign; | ||
293 | + } else if (!type_change) { | ||
294 | + ctx->a_mask = z_mask_old ^ z_mask; | ||
295 | + } | ||
296 | + ctx->z_mask = z_mask; | ||
297 | + | ||
298 | + return fold_masks(ctx, op); | ||
299 | } | ||
300 | |||
301 | static bool fold_extu(OptContext *ctx, TCGOp *op) | ||
302 | { | ||
303 | - return fold_const1(ctx, op); | ||
304 | + uint64_t z_mask_old, z_mask; | ||
305 | + bool type_change = false; | ||
306 | + | ||
307 | + if (fold_const1(ctx, op)) { | ||
308 | + return true; | ||
309 | + } | ||
310 | + | ||
311 | + z_mask_old = z_mask = arg_info(op->args[1])->z_mask; | ||
312 | + | ||
313 | + switch (op->opc) { | ||
314 | + CASE_OP_32_64(ext8u): | ||
315 | + z_mask = (uint8_t)z_mask; | ||
316 | + break; | ||
317 | + CASE_OP_32_64(ext16u): | ||
318 | + z_mask = (uint16_t)z_mask; | ||
319 | + break; | ||
320 | + case INDEX_op_extrl_i64_i32: | ||
321 | + case INDEX_op_extu_i32_i64: | ||
322 | + type_change = true; | ||
323 | + QEMU_FALLTHROUGH; | ||
324 | + case INDEX_op_ext32u_i64: | ||
325 | + z_mask = (uint32_t)z_mask; | ||
326 | + break; | ||
327 | + case INDEX_op_extrh_i64_i32: | ||
328 | + type_change = true; | ||
329 | + z_mask >>= 32; | ||
330 | + break; | ||
331 | + default: | ||
332 | + g_assert_not_reached(); | ||
333 | + } | ||
334 | + | ||
335 | + ctx->z_mask = z_mask; | ||
336 | + if (!type_change) { | ||
337 | + ctx->a_mask = z_mask_old ^ z_mask; | ||
338 | + } | ||
339 | + return fold_masks(ctx, op); | ||
340 | } | ||
341 | |||
342 | static bool fold_mb(OptContext *ctx, TCGOp *op) | ||
343 | @@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op) | ||
344 | return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]); | ||
345 | } | ||
346 | |||
347 | + ctx->z_mask = arg_info(op->args[3])->z_mask | ||
348 | + | arg_info(op->args[4])->z_mask; | ||
349 | + | ||
350 | if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { | ||
351 | uint64_t tv = arg_info(op->args[3])->val; | ||
352 | uint64_t fv = arg_info(op->args[4])->val; | ||
353 | @@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op) | ||
354 | |||
355 | static bool fold_neg(OptContext *ctx, TCGOp *op) | ||
356 | { | ||
357 | + uint64_t z_mask; | ||
358 | + | ||
359 | if (fold_const1(ctx, op)) { | ||
360 | return true; | ||
361 | } | ||
362 | + | ||
363 | + /* Set to 1 all bits to the left of the rightmost set bit. */ | ||
364 | + z_mask = arg_info(op->args[1])->z_mask; | ||
365 | + ctx->z_mask = -(z_mask & -z_mask); | ||
366 | + | ||
367 | /* | ||
368 | * Because of fold_sub_to_neg, we want to always return true, | ||
369 | * via finish_folding. | ||
370 | @@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op) | ||
371 | fold_xx_to_x(ctx, op)) { | ||
372 | return true; | ||
373 | } | ||
374 | - return false; | ||
375 | + | ||
376 | + ctx->z_mask = arg_info(op->args[1])->z_mask | ||
377 | + | arg_info(op->args[2])->z_mask; | ||
378 | + return fold_masks(ctx, op); | ||
379 | } | ||
380 | |||
381 | static bool fold_orc(OptContext *ctx, TCGOp *op) | ||
382 | @@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op) | ||
383 | |||
384 | static bool fold_qemu_ld(OptContext *ctx, TCGOp *op) | ||
385 | { | ||
386 | + const TCGOpDef *def = &tcg_op_defs[op->opc]; | ||
387 | + MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs]; | ||
388 | + MemOp mop = get_memop(oi); | ||
389 | + int width = 8 * memop_size(mop); | ||
390 | + | ||
391 | + if (!(mop & MO_SIGN) && width < 64) { | ||
392 | + ctx->z_mask = MAKE_64BIT_MASK(0, width); | ||
393 | + } | ||
394 | + | ||
395 | /* Opcodes that touch guest memory stop the mb optimization. */ | ||
396 | ctx->prev_mb = NULL; | ||
397 | return false; | ||
398 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op) | ||
399 | if (i >= 0) { | ||
400 | return tcg_opt_gen_movi(ctx, op, op->args[0], i); | ||
401 | } | ||
402 | + | ||
403 | + ctx->z_mask = 1; | ||
404 | return false; | ||
405 | } | ||
406 | |||
407 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op) | ||
408 | op->opc = INDEX_op_setcond_i32; | ||
409 | break; | ||
410 | } | ||
411 | + | ||
412 | + ctx->z_mask = 1; | ||
413 | return false; | ||
414 | |||
415 | do_setcond_const: | ||
416 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op) | ||
417 | |||
418 | static bool fold_sextract(OptContext *ctx, TCGOp *op) | ||
419 | { | ||
420 | + int64_t z_mask_old, z_mask; | ||
421 | + | ||
422 | if (arg_is_const(op->args[1])) { | ||
423 | uint64_t t; | ||
424 | |||
425 | @@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op) | ||
426 | t = sextract64(t, op->args[2], op->args[3]); | ||
427 | return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
428 | } | ||
429 | - return false; | ||
430 | + | ||
431 | + z_mask_old = arg_info(op->args[1])->z_mask; | ||
432 | + z_mask = sextract64(z_mask_old, op->args[2], op->args[3]); | ||
433 | + if (op->args[2] == 0 && z_mask >= 0) { | ||
434 | + ctx->a_mask = z_mask_old ^ z_mask; | ||
435 | + } | ||
436 | + ctx->z_mask = z_mask; | ||
437 | + | ||
438 | + return fold_masks(ctx, op); | ||
439 | } | ||
440 | |||
441 | static bool fold_shift(OptContext *ctx, TCGOp *op) | ||
442 | @@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op) | ||
443 | fold_xi_to_x(ctx, op, 0)) { | ||
444 | return true; | ||
445 | } | ||
446 | + | ||
447 | + if (arg_is_const(op->args[2])) { | ||
448 | + ctx->z_mask = do_constant_folding(op->opc, ctx->type, | ||
449 | + arg_info(op->args[1])->z_mask, | ||
450 | + arg_info(op->args[2])->val); | ||
451 | + return fold_masks(ctx, op); | ||
452 | + } | ||
453 | return false; | ||
454 | } | ||
455 | |||
456 | @@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op) | ||
457 | return fold_addsub2_i32(ctx, op, false); | ||
458 | } | ||
459 | |||
460 | +static bool fold_tcg_ld(OptContext *ctx, TCGOp *op) | ||
461 | +{ | ||
462 | + /* We can't do any folding with a load, but we can record known-zero bits. */ | ||
463 | + switch (op->opc) { | ||
464 | + CASE_OP_32_64(ld8u): | ||
465 | + ctx->z_mask = MAKE_64BIT_MASK(0, 8); | ||
466 | + break; | ||
467 | + CASE_OP_32_64(ld16u): | ||
468 | + ctx->z_mask = MAKE_64BIT_MASK(0, 16); | ||
469 | + break; | ||
470 | + case INDEX_op_ld32u_i64: | ||
471 | + ctx->z_mask = MAKE_64BIT_MASK(0, 32); | ||
472 | + break; | ||
473 | + default: | ||
474 | + g_assert_not_reached(); | ||
475 | + } | ||
476 | + return false; | ||
477 | +} | ||
478 | + | ||
479 | static bool fold_xor(OptContext *ctx, TCGOp *op) | ||
480 | { | ||
481 | if (fold_const2(ctx, op) || | ||
482 | @@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op) | ||
483 | fold_xi_to_not(ctx, op, -1)) { | ||
484 | return true; | ||
485 | } | ||
486 | - return false; | ||
487 | + | ||
488 | + ctx->z_mask = arg_info(op->args[1])->z_mask | ||
489 | + | arg_info(op->args[2])->z_mask; | ||
490 | + return fold_masks(ctx, op); | ||
491 | } | ||
492 | |||
493 | /* Propagate constants and copies, fold constant expressions. */ | ||
494 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
495 | } | ||
496 | |||
497 | QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) { | ||
498 | - uint64_t z_mask, partmask, affected, tmp; | ||
499 | TCGOpcode opc = op->opc; | ||
500 | const TCGOpDef *def; | ||
501 | bool done = false; | ||
502 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
503 | break; | ||
504 | } | ||
505 | |||
506 | - /* Simplify using known-zero bits. Currently only ops with a single | ||
507 | - output argument are supported. */ | ||
508 | - z_mask = -1; | ||
509 | - affected = -1; | ||
510 | - switch (opc) { | ||
511 | - CASE_OP_32_64(ext8s): | ||
512 | - if ((arg_info(op->args[1])->z_mask & 0x80) != 0) { | ||
513 | - break; | ||
514 | - } | ||
515 | - QEMU_FALLTHROUGH; | ||
516 | - CASE_OP_32_64(ext8u): | ||
517 | - z_mask = 0xff; | ||
518 | - goto and_const; | ||
519 | - CASE_OP_32_64(ext16s): | ||
520 | - if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) { | ||
521 | - break; | ||
522 | - } | ||
523 | - QEMU_FALLTHROUGH; | ||
524 | - CASE_OP_32_64(ext16u): | ||
525 | - z_mask = 0xffff; | ||
526 | - goto and_const; | ||
527 | - case INDEX_op_ext32s_i64: | ||
528 | - if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) { | ||
529 | - break; | ||
530 | - } | ||
531 | - QEMU_FALLTHROUGH; | ||
532 | - case INDEX_op_ext32u_i64: | ||
533 | - z_mask = 0xffffffffU; | ||
534 | - goto and_const; | ||
535 | - | ||
536 | - CASE_OP_32_64(and): | ||
537 | - z_mask = arg_info(op->args[2])->z_mask; | ||
538 | - if (arg_is_const(op->args[2])) { | ||
539 | - and_const: | ||
540 | - affected = arg_info(op->args[1])->z_mask & ~z_mask; | ||
541 | - } | ||
542 | - z_mask = arg_info(op->args[1])->z_mask & z_mask; | ||
543 | - break; | ||
544 | - | ||
545 | - case INDEX_op_ext_i32_i64: | ||
546 | - if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) { | ||
547 | - break; | ||
548 | - } | ||
549 | - QEMU_FALLTHROUGH; | ||
550 | - case INDEX_op_extu_i32_i64: | ||
551 | - /* We do not compute affected as it is a size changing op. */ | ||
552 | - z_mask = (uint32_t)arg_info(op->args[1])->z_mask; | ||
553 | - break; | ||
554 | - | ||
555 | - CASE_OP_32_64(andc): | ||
556 | - /* Known-zeros do not imply known-ones. Therefore unless | ||
557 | - op->args[2] is constant, we can't infer anything from it. */ | ||
558 | - if (arg_is_const(op->args[2])) { | ||
559 | - z_mask = ~arg_info(op->args[2])->z_mask; | ||
560 | - goto and_const; | ||
561 | - } | ||
562 | - /* But we certainly know nothing outside args[1] may be set. */ | ||
563 | - z_mask = arg_info(op->args[1])->z_mask; | ||
564 | - break; | ||
565 | - | ||
566 | - case INDEX_op_sar_i32: | ||
567 | - if (arg_is_const(op->args[2])) { | ||
568 | - tmp = arg_info(op->args[2])->val & 31; | ||
569 | - z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp; | ||
570 | - } | ||
571 | - break; | ||
572 | - case INDEX_op_sar_i64: | ||
573 | - if (arg_is_const(op->args[2])) { | ||
574 | - tmp = arg_info(op->args[2])->val & 63; | ||
575 | - z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp; | ||
576 | - } | ||
577 | - break; | ||
578 | - | ||
579 | - case INDEX_op_shr_i32: | ||
580 | - if (arg_is_const(op->args[2])) { | ||
581 | - tmp = arg_info(op->args[2])->val & 31; | ||
582 | - z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp; | ||
583 | - } | ||
584 | - break; | ||
585 | - case INDEX_op_shr_i64: | ||
586 | - if (arg_is_const(op->args[2])) { | ||
587 | - tmp = arg_info(op->args[2])->val & 63; | ||
588 | - z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp; | ||
589 | - } | ||
590 | - break; | ||
591 | - | ||
592 | - case INDEX_op_extrl_i64_i32: | ||
593 | - z_mask = (uint32_t)arg_info(op->args[1])->z_mask; | ||
594 | - break; | ||
595 | - case INDEX_op_extrh_i64_i32: | ||
596 | - z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32; | ||
597 | - break; | ||
598 | - | ||
599 | - CASE_OP_32_64(shl): | ||
600 | - if (arg_is_const(op->args[2])) { | ||
601 | - tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1); | ||
602 | - z_mask = arg_info(op->args[1])->z_mask << tmp; | ||
603 | - } | ||
604 | - break; | ||
605 | - | ||
606 | - CASE_OP_32_64(neg): | ||
607 | - /* Set to 1 all bits to the left of the rightmost set bit. */ | ||
608 | - z_mask = -(arg_info(op->args[1])->z_mask | ||
609 | - & -arg_info(op->args[1])->z_mask); | ||
610 | - break; | ||
611 | - | ||
612 | - CASE_OP_32_64(deposit): | ||
613 | - z_mask = deposit64(arg_info(op->args[1])->z_mask, | ||
614 | - op->args[3], op->args[4], | ||
615 | - arg_info(op->args[2])->z_mask); | ||
616 | - break; | ||
617 | - | ||
618 | - CASE_OP_32_64(extract): | ||
619 | - z_mask = extract64(arg_info(op->args[1])->z_mask, | ||
620 | - op->args[2], op->args[3]); | ||
621 | - if (op->args[2] == 0) { | ||
622 | - affected = arg_info(op->args[1])->z_mask & ~z_mask; | ||
623 | - } | ||
624 | - break; | ||
625 | - CASE_OP_32_64(sextract): | ||
626 | - z_mask = sextract64(arg_info(op->args[1])->z_mask, | ||
627 | - op->args[2], op->args[3]); | ||
628 | - if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) { | ||
629 | - affected = arg_info(op->args[1])->z_mask & ~z_mask; | ||
630 | - } | ||
631 | - break; | ||
632 | - | ||
633 | - CASE_OP_32_64(or): | ||
634 | - CASE_OP_32_64(xor): | ||
635 | - z_mask = arg_info(op->args[1])->z_mask | ||
636 | - | arg_info(op->args[2])->z_mask; | ||
637 | - break; | ||
638 | - | ||
639 | - case INDEX_op_clz_i32: | ||
640 | - case INDEX_op_ctz_i32: | ||
641 | - z_mask = arg_info(op->args[2])->z_mask | 31; | ||
642 | - break; | ||
643 | - | ||
644 | - case INDEX_op_clz_i64: | ||
645 | - case INDEX_op_ctz_i64: | ||
646 | - z_mask = arg_info(op->args[2])->z_mask | 63; | ||
647 | - break; | ||
648 | - | ||
649 | - case INDEX_op_ctpop_i32: | ||
650 | - z_mask = 32 | 31; | ||
651 | - break; | ||
652 | - case INDEX_op_ctpop_i64: | ||
653 | - z_mask = 64 | 63; | ||
654 | - break; | ||
655 | - | ||
656 | - CASE_OP_32_64(setcond): | ||
657 | - case INDEX_op_setcond2_i32: | ||
658 | - z_mask = 1; | ||
659 | - break; | ||
660 | - | ||
661 | - CASE_OP_32_64(movcond): | ||
662 | - z_mask = arg_info(op->args[3])->z_mask | ||
663 | - | arg_info(op->args[4])->z_mask; | ||
664 | - break; | ||
665 | - | ||
666 | - CASE_OP_32_64(ld8u): | ||
667 | - z_mask = 0xff; | ||
668 | - break; | ||
669 | - CASE_OP_32_64(ld16u): | ||
670 | - z_mask = 0xffff; | ||
671 | - break; | ||
672 | - case INDEX_op_ld32u_i64: | ||
673 | - z_mask = 0xffffffffu; | ||
674 | - break; | ||
675 | - | ||
676 | - CASE_OP_32_64(qemu_ld): | ||
677 | - { | ||
678 | - MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs]; | ||
679 | - MemOp mop = get_memop(oi); | ||
680 | - if (!(mop & MO_SIGN)) { | ||
681 | - z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1; | ||
682 | - } | ||
683 | - } | ||
684 | - break; | ||
685 | - | ||
686 | - CASE_OP_32_64(bswap16): | ||
687 | - z_mask = arg_info(op->args[1])->z_mask; | ||
688 | - if (z_mask <= 0xffff) { | ||
689 | - op->args[2] |= TCG_BSWAP_IZ; | ||
690 | - } | ||
691 | - z_mask = bswap16(z_mask); | ||
692 | - switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) { | ||
693 | - case TCG_BSWAP_OZ: | ||
694 | - break; | ||
695 | - case TCG_BSWAP_OS: | ||
696 | - z_mask = (int16_t)z_mask; | ||
697 | - break; | ||
698 | - default: /* undefined high bits */ | ||
699 | - z_mask |= MAKE_64BIT_MASK(16, 48); | ||
700 | - break; | ||
701 | - } | ||
702 | - break; | ||
703 | - | ||
704 | - case INDEX_op_bswap32_i64: | ||
705 | - z_mask = arg_info(op->args[1])->z_mask; | ||
706 | - if (z_mask <= 0xffffffffu) { | ||
707 | - op->args[2] |= TCG_BSWAP_IZ; | ||
708 | - } | ||
709 | - z_mask = bswap32(z_mask); | ||
710 | - switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) { | ||
711 | - case TCG_BSWAP_OZ: | ||
712 | - break; | ||
713 | - case TCG_BSWAP_OS: | ||
714 | - z_mask = (int32_t)z_mask; | ||
715 | - break; | ||
716 | - default: /* undefined high bits */ | ||
717 | - z_mask |= MAKE_64BIT_MASK(32, 32); | ||
718 | - break; | ||
719 | - } | ||
720 | - break; | ||
721 | - | ||
722 | - default: | ||
723 | - break; | ||
724 | - } | ||
725 | - | ||
726 | - /* 32-bit ops generate 32-bit results. For the "result is zero" test | ||
727 | - below, we can ignore high bits, but for further optimizations we | ||
728 | - need to record that the high bits contain garbage. */ | ||
729 | - partmask = z_mask; | ||
730 | - if (ctx.type == TCG_TYPE_I32) { | ||
731 | - z_mask |= ~(tcg_target_ulong)0xffffffffu; | ||
732 | - partmask &= 0xffffffffu; | ||
733 | - affected &= 0xffffffffu; | ||
734 | - } | ||
735 | - ctx.z_mask = z_mask; | ||
736 | - | ||
737 | - if (partmask == 0) { | ||
738 | - tcg_opt_gen_movi(&ctx, op, op->args[0], 0); | ||
739 | - continue; | ||
740 | - } | ||
741 | - if (affected == 0) { | ||
742 | - tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]); | ||
743 | - continue; | ||
744 | - } | ||
745 | + /* Assume all bits affected, and no bits known zero. */ | ||
746 | + ctx.a_mask = -1; | ||
747 | + ctx.z_mask = -1; | ||
748 | |||
749 | /* | ||
750 | * Process each opcode. | ||
751 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
752 | case INDEX_op_extrh_i64_i32: | ||
753 | done = fold_extu(&ctx, op); | ||
754 | break; | ||
755 | + CASE_OP_32_64(ld8u): | ||
756 | + CASE_OP_32_64(ld16u): | ||
757 | + case INDEX_op_ld32u_i64: | ||
758 | + done = fold_tcg_ld(&ctx, op); | ||
759 | + break; | ||
760 | case INDEX_op_mb: | ||
761 | done = fold_mb(&ctx, op); | ||
762 | break; | ||
763 | -- | ||
764 | 2.25.1 | ||
765 | |||
766 | diff view generated by jsdifflib |
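The least obvious step in the patch above is the z_mask computation in fold_neg: z & -z isolates the lowest possibly-set bit of the operand, and negating that yields a mask of this bit plus everything above it, the only positions at which the negation of a value bounded by z can be nonzero. A standalone check of the trick, illustrative only:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t z = 0xff00;           /* value known to lie within bits 8..15 */
        uint64_t neg_mask = -(z & -z); /* == 0xffffffffffffff00 */

        for (uint64_t v = 0; v <= 0xff; v++) {
            uint64_t val = v << 8;     /* any value consistent with z */
            assert(((-val) & ~neg_mask) == 0);
        }
        return 0;
    }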
New patch | |||
---|---|---|---|
1 | Rename to fold_multiply2, and handle muls2_i32, mulu2_i64, | ||
2 | and muls2_i64. | ||
1 | 3 | ||
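In the 32-bit cases below, the double-word product is computed in an ordinary 64-bit temporary and then split, with casts through int32_t narrowing each half to 32 bits. A standalone illustration of the mulu2_i32 arithmetic, values arbitrary and not QEMU code:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t a = 0xfffffffe, b = 2;  /* the two 32-bit multiplicands */
        uint64_t l = (uint64_t)(uint32_t)a * (uint32_t)b; /* 0x1fffffffc */
        uint64_t h = (uint64_t)(int32_t)(l >> 32);        /* high half: 1 */

        l = (uint64_t)(int32_t)l;        /* low half, narrowed: ...fffffffc */
        printf("h=%016" PRIx64 " l=%016" PRIx64 "\n", h, l);
        return 0;
    }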
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++--------- | ||
9 | 1 file changed, 35 insertions(+), 9 deletions(-) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op) | ||
16 | return false; | ||
17 | } | ||
18 | |||
19 | -static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op) | ||
20 | +static bool fold_multiply2(OptContext *ctx, TCGOp *op) | ||
21 | { | ||
22 | if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) { | ||
23 | - uint32_t a = arg_info(op->args[2])->val; | ||
24 | - uint32_t b = arg_info(op->args[3])->val; | ||
25 | - uint64_t r = (uint64_t)a * b; | ||
26 | + uint64_t a = arg_info(op->args[2])->val; | ||
27 | + uint64_t b = arg_info(op->args[3])->val; | ||
28 | + uint64_t h, l; | ||
29 | TCGArg rl, rh; | ||
30 | - TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32); | ||
31 | + TCGOp *op2; | ||
32 | + | ||
33 | + switch (op->opc) { | ||
34 | + case INDEX_op_mulu2_i32: | ||
35 | + l = (uint64_t)(uint32_t)a * (uint32_t)b; | ||
36 | + h = (int32_t)(l >> 32); | ||
37 | + l = (int32_t)l; | ||
38 | + break; | ||
39 | + case INDEX_op_muls2_i32: | ||
40 | + l = (int64_t)(int32_t)a * (int32_t)b; | ||
41 | + h = l >> 32; | ||
42 | + l = (int32_t)l; | ||
43 | + break; | ||
44 | + case INDEX_op_mulu2_i64: | ||
45 | + mulu64(&l, &h, a, b); | ||
46 | + break; | ||
47 | + case INDEX_op_muls2_i64: | ||
48 | + muls64(&l, &h, a, b); | ||
49 | + break; | ||
50 | + default: | ||
51 | + g_assert_not_reached(); | ||
52 | + } | ||
53 | |||
54 | rl = op->args[0]; | ||
55 | rh = op->args[1]; | ||
56 | - tcg_opt_gen_movi(ctx, op, rl, (int32_t)r); | ||
57 | - tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32)); | ||
58 | + | ||
59 | + /* The proper opcode is supplied by tcg_opt_gen_mov. */ | ||
60 | + op2 = tcg_op_insert_before(ctx->tcg, op, 0); | ||
61 | + | ||
62 | + tcg_opt_gen_movi(ctx, op, rl, l); | ||
63 | + tcg_opt_gen_movi(ctx, op2, rh, h); | ||
64 | return true; | ||
65 | } | ||
66 | return false; | ||
67 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
68 | CASE_OP_32_64(muluh): | ||
69 | done = fold_mul_highpart(&ctx, op); | ||
70 | break; | ||
71 | - case INDEX_op_mulu2_i32: | ||
72 | - done = fold_mulu2_i32(&ctx, op); | ||
73 | + CASE_OP_32_64(muls2): | ||
74 | + CASE_OP_32_64(mulu2): | ||
75 | + done = fold_multiply2(&ctx, op); | ||
76 | break; | ||
77 | CASE_OP_32_64(nand): | ||
78 | done = fold_nand(&ctx, op); | ||
79 | -- | ||
80 | 2.25.1 | ||
81 | |||
82 | diff view generated by jsdifflib |
1 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 1 | Rename to fold_addsub2. |
---|---|---|---|
2 | Use Int128 to implement the wider operation. | ||
3 | |||
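A sketch of what the 64-bit add2 fold computes after this change, using the compiler's unsigned __int128 (a GCC/Clang extension) in place of QEMU's Int128 helpers such as int128_add; illustrative only:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t al = UINT64_MAX, ah = 0;  /* a = 2^64 - 1 */
        uint64_t bl = 1, bh = 0;           /* b = 1 */
        unsigned __int128 a = ((unsigned __int128)ah << 64) | al;
        unsigned __int128 b = ((unsigned __int128)bh << 64) | bl;
        unsigned __int128 r = a + b;       /* carry reaches the high word */

        /* Prints rl=0000000000000000 rh=0000000000000001. */
        printf("rl=%016" PRIx64 " rh=%016" PRIx64 "\n",
               (uint64_t)r, (uint64_t)(r >> 64));
        return 0;
    }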
4 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
2 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
3 | --- | 8 | --- |
4 | tcg/arm/tcg-target.c.inc | 52 +++++++++++++++++++++++++++++++++++----- | 9 | tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++---------------- |
5 | 1 file changed, 46 insertions(+), 6 deletions(-) | 10 | 1 file changed, 44 insertions(+), 21 deletions(-) |
6 | 11 | ||
7 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 12 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
8 | index XXXXXXX..XXXXXXX 100644 | 13 | index XXXXXXX..XXXXXXX 100644 |
9 | --- a/tcg/arm/tcg-target.c.inc | 14 | --- a/tcg/optimize.c |
10 | +++ b/tcg/arm/tcg-target.c.inc | 15 | +++ b/tcg/optimize.c |
11 | @@ -XXX,XX +XXX,XX @@ typedef enum { | 16 | @@ -XXX,XX +XXX,XX @@ |
12 | /* Otherwise the assembler uses mov r0,r0 */ | 17 | */ |
13 | INSN_NOP_v4 = (COND_AL << 28) | ARITH_MOV, | 18 | |
14 | 19 | #include "qemu/osdep.h" | |
15 | + INSN_VORR = 0xf2200110, | 20 | +#include "qemu/int128.h" |
16 | + | 21 | #include "tcg/tcg-op.h" |
17 | INSN_VLD1 = 0xf4200000, /* VLD1 (multiple single elements) */ | 22 | #include "tcg-internal.h" |
18 | INSN_VST1 = 0xf4000000, /* VST1 (multiple single elements) */ | 23 | |
19 | } ARMInsn; | 24 | @@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op) |
20 | @@ -XXX,XX +XXX,XX @@ static uint32_t encode_vd(TCGReg rd) | ||
21 | return (extract32(rd, 3, 1) << 22) | (extract32(rd, 0, 3) << 13); | ||
22 | } | ||
23 | |||
24 | +static uint32_t encode_vn(TCGReg rn) | ||
25 | +{ | ||
26 | + tcg_debug_assert(rn >= TCG_REG_Q0); | ||
27 | + return (extract32(rn, 3, 1) << 7) | (extract32(rn, 0, 3) << 17); | ||
28 | +} | ||
29 | + | ||
30 | +static uint32_t encode_vm(TCGReg rm) | ||
31 | +{ | ||
32 | + tcg_debug_assert(rm >= TCG_REG_Q0); | ||
33 | + return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1); | ||
34 | +} | ||
35 | + | ||
36 | +static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece, | ||
37 | + TCGReg d, TCGReg n, TCGReg m) | ||
38 | +{ | ||
39 | + tcg_out32(s, insn | (vece << 20) | (q << 6) | | ||
40 | + encode_vd(d) | encode_vn(n) | encode_vm(m)); | ||
41 | +} | ||
42 | + | ||
43 | static void tcg_out_vldst(TCGContext *s, ARMInsn insn, | ||
44 | TCGReg rd, TCGReg rn, int offset) | ||
45 | { | ||
46 | @@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, | ||
47 | return false; | 25 | return false; |
48 | } | 26 | } |
49 | 27 | ||
50 | -static inline bool tcg_out_mov(TCGContext *s, TCGType type, | 28 | -static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add) |
51 | - TCGReg ret, TCGReg arg) | 29 | +static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add) |
52 | +static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) | ||
53 | { | 30 | { |
54 | - tcg_out_mov_reg(s, COND_AL, ret, arg); | 31 | if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) && |
55 | - return true; | 32 | arg_is_const(op->args[4]) && arg_is_const(op->args[5])) { |
56 | + if (ret == arg) { | 33 | - uint32_t al = arg_info(op->args[2])->val; |
57 | + return true; | 34 | - uint32_t ah = arg_info(op->args[3])->val; |
58 | + } | 35 | - uint32_t bl = arg_info(op->args[4])->val; |
59 | + switch (type) { | 36 | - uint32_t bh = arg_info(op->args[5])->val; |
60 | + case TCG_TYPE_I32: | 37 | - uint64_t a = ((uint64_t)ah << 32) | al; |
61 | + if (ret < TCG_REG_Q0 && arg < TCG_REG_Q0) { | 38 | - uint64_t b = ((uint64_t)bh << 32) | bl; |
62 | + tcg_out_mov_reg(s, COND_AL, ret, arg); | 39 | + uint64_t al = arg_info(op->args[2])->val; |
63 | + return true; | 40 | + uint64_t ah = arg_info(op->args[3])->val; |
64 | + } | 41 | + uint64_t bl = arg_info(op->args[4])->val; |
65 | + return false; | 42 | + uint64_t bh = arg_info(op->args[5])->val; |
43 | TCGArg rl, rh; | ||
44 | - TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32); | ||
45 | + TCGOp *op2; | ||
46 | |||
47 | - if (add) { | ||
48 | - a += b; | ||
49 | + if (ctx->type == TCG_TYPE_I32) { | ||
50 | + uint64_t a = deposit64(al, 32, 32, ah); | ||
51 | + uint64_t b = deposit64(bl, 32, 32, bh); | ||
66 | + | 52 | + |
67 | + case TCG_TYPE_V64: | 53 | + if (add) { |
68 | + case TCG_TYPE_V128: | 54 | + a += b; |
69 | + /* "VMOV D,N" is an alias for "VORR D,N,N". */ | 55 | + } else { |
70 | + tcg_out_vreg3(s, INSN_VORR, type - TCG_TYPE_V64, 0, ret, arg, arg); | 56 | + a -= b; |
71 | + return true; | 57 | + } |
72 | + | 58 | + |
73 | + default: | 59 | + al = sextract64(a, 0, 32); |
74 | + g_assert_not_reached(); | 60 | + ah = sextract64(a, 32, 32); |
75 | + } | 61 | } else { |
62 | - a -= b; | ||
63 | + Int128 a = int128_make128(al, ah); | ||
64 | + Int128 b = int128_make128(bl, bh); | ||
65 | + | ||
66 | + if (add) { | ||
67 | + a = int128_add(a, b); | ||
68 | + } else { | ||
69 | + a = int128_sub(a, b); | ||
70 | + } | ||
71 | + | ||
72 | + al = int128_getlo(a); | ||
73 | + ah = int128_gethi(a); | ||
74 | } | ||
75 | |||
76 | rl = op->args[0]; | ||
77 | rh = op->args[1]; | ||
78 | - tcg_opt_gen_movi(ctx, op, rl, (int32_t)a); | ||
79 | - tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32)); | ||
80 | + | ||
81 | + /* The proper opcode is supplied by tcg_opt_gen_mov. */ | ||
82 | + op2 = tcg_op_insert_before(ctx->tcg, op, 0); | ||
83 | + | ||
84 | + tcg_opt_gen_movi(ctx, op, rl, al); | ||
85 | + tcg_opt_gen_movi(ctx, op2, rh, ah); | ||
86 | return true; | ||
87 | } | ||
88 | return false; | ||
76 | } | 89 | } |
77 | 90 | ||
78 | -static inline void tcg_out_movi(TCGContext *s, TCGType type, | 91 | -static bool fold_add2_i32(OptContext *ctx, TCGOp *op) |
79 | - TCGReg ret, tcg_target_long arg) | 92 | +static bool fold_add2(OptContext *ctx, TCGOp *op) |
80 | +static void tcg_out_movi(TCGContext *s, TCGType type, | ||
81 | + TCGReg ret, tcg_target_long arg) | ||
82 | { | 93 | { |
83 | + tcg_debug_assert(type == TCG_TYPE_I32); | 94 | - return fold_addsub2_i32(ctx, op, true); |
84 | + tcg_debug_assert(ret < TCG_REG_Q0); | 95 | + return fold_addsub2(ctx, op, true); |
85 | tcg_out_movi32(s, COND_AL, ret, arg); | ||
86 | } | 96 | } |
87 | 97 | ||
98 | static bool fold_and(OptContext *ctx, TCGOp *op) | ||
99 | @@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op) | ||
100 | return false; | ||
101 | } | ||
102 | |||
103 | -static bool fold_sub2_i32(OptContext *ctx, TCGOp *op) | ||
104 | +static bool fold_sub2(OptContext *ctx, TCGOp *op) | ||
105 | { | ||
106 | - return fold_addsub2_i32(ctx, op, false); | ||
107 | + return fold_addsub2(ctx, op, false); | ||
108 | } | ||
109 | |||
110 | static bool fold_tcg_ld(OptContext *ctx, TCGOp *op) | ||
111 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
112 | CASE_OP_32_64_VEC(add): | ||
113 | done = fold_add(&ctx, op); | ||
114 | break; | ||
115 | - case INDEX_op_add2_i32: | ||
116 | - done = fold_add2_i32(&ctx, op); | ||
117 | + CASE_OP_32_64(add2): | ||
118 | + done = fold_add2(&ctx, op); | ||
119 | break; | ||
120 | CASE_OP_32_64_VEC(and): | ||
121 | done = fold_and(&ctx, op); | ||
122 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
123 | CASE_OP_32_64_VEC(sub): | ||
124 | done = fold_sub(&ctx, op); | ||
125 | break; | ||
126 | - case INDEX_op_sub2_i32: | ||
127 | - done = fold_sub2_i32(&ctx, op); | ||
128 | + CASE_OP_32_64(sub2): | ||
129 | + done = fold_sub2(&ctx, op); | ||
130 | break; | ||
131 | CASE_OP_32_64_VEC(xor): | ||
132 | done = fold_xor(&ctx, op); | ||
88 | -- | 133 | -- |
89 | 2.25.1 | 134 | 2.25.1 |
90 | 135 | ||
91 | 136 | diff view generated by jsdifflib |
1 | This consists of the three immediate shifts: shli, shri, sari. | 1 | Most of these are handled by introducing a fold_const2_commutative
---|---|---|---|
2 | 2 | that covers all of the binary operators. The rest were already | ||
3 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 3 | handled on a case-by-case basis in the switch, and have their |
4 | own fold function in which to place the call. | ||
5 | |||
6 | We now have only one major switch on TCGOpcode. | ||
7 | |||
8 | Introduce NO_DEST and a block comment for swap_commutative in | ||
9 | order to make the handling of brcond and movcond opcodes cleaner. | ||
10 | |||
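Below, fold_const2_commutative simply canonicalizes before delegating to fold_const2. As a deliberately simplified stand-in for the constant-second part of that rule (the real swap_commutative, whose contract is documented in the new block comment below, also swaps when the second operand matches the destination), in plain C rather than QEMU types:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct { bool is_const; long val; } Arg;

    /* Put a constant operand in the second slot; report whether we swapped. */
    static bool swap_const_second(Arg **p1, Arg **p2)
    {
        if ((*p1)->is_const && !(*p2)->is_const) {
            Arg *t = *p1;
            *p1 = *p2;
            *p2 = t;
            return true;
        }
        return false;
    }

    int main(void)
    {
        Arg c = { true, 5 }, x = { false, 0 };
        Arg *a1 = &c, *a2 = &x;

        if (swap_const_second(&a1, &a2)) {
            /* e.g. add t0, 5, t1 becomes add t0, t1, 5 */
            printf("canonicalized: constant %ld now second\n", a2->val);
        }
        return 0;
    }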
11 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 12 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
5 | --- | 13 | --- |
6 | tcg/arm/tcg-target.h | 2 +- | 14 | tcg/optimize.c | 142 ++++++++++++++++++++++++------------------------- |
7 | tcg/arm/tcg-target.c.inc | 27 +++++++++++++++++++++++++++ | 15 | 1 file changed, 70 insertions(+), 72 deletions(-) |
8 | 2 files changed, 28 insertions(+), 1 deletion(-) | 16 | |
9 | 17 | diff --git a/tcg/optimize.c b/tcg/optimize.c | |
10 | diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h | ||
11 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
12 | --- a/tcg/arm/tcg-target.h | 19 | --- a/tcg/optimize.c |
13 | +++ b/tcg/arm/tcg-target.h | 20 | +++ b/tcg/optimize.c |
14 | @@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions; | 21 | @@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c) |
15 | #define TCG_TARGET_HAS_roti_vec 0 | 22 | return -1; |
16 | #define TCG_TARGET_HAS_rots_vec 0 | ||
17 | #define TCG_TARGET_HAS_rotv_vec 0 | ||
18 | -#define TCG_TARGET_HAS_shi_vec 0 | ||
19 | +#define TCG_TARGET_HAS_shi_vec 1 | ||
20 | #define TCG_TARGET_HAS_shs_vec 0 | ||
21 | #define TCG_TARGET_HAS_shv_vec 0 | ||
22 | #define TCG_TARGET_HAS_mul_vec 0 | ||
23 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | ||
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/tcg/arm/tcg-target.c.inc | ||
26 | +++ b/tcg/arm/tcg-target.c.inc | ||
27 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
28 | INSN_VCGE_U = 0xf3000310, | ||
29 | INSN_VCGT_U = 0xf3000300, | ||
30 | |||
31 | + INSN_VSHLI = 0xf2800510, /* VSHL (immediate) */ | ||
32 | + INSN_VSARI = 0xf2800010, /* VSHR.S */ | ||
33 | + INSN_VSHRI = 0xf3800010, /* VSHR.U */ | ||
34 | + | ||
35 | INSN_VTST = 0xf2000810, | ||
36 | |||
37 | INSN_VDUP_G = 0xee800b10, /* VDUP (ARM core register) */ | ||
38 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vmovi(TCGContext *s, TCGReg rd, | ||
39 | | (extract32(imm8, 7, 1) << 24)); | ||
40 | } | 23 | } |
41 | 24 | ||
42 | +static void tcg_out_vshifti(TCGContext *s, ARMInsn insn, int q, | 25 | +/** |
43 | + TCGReg rd, TCGReg rm, int l_imm6) | 26 | + * swap_commutative: |
27 | + * @dest: TCGArg of the destination argument, or NO_DEST. | ||
28 | + * @p1: first paired argument | ||
29 | + * @p2: second paired argument | ||
30 | + * | ||
31 | + * If *@p1 is a constant and *@p2 is not, swap. | ||
32 | + * If *@p2 matches @dest, swap. | ||
33 | + * Return true if a swap was performed. | ||
34 | + */ | ||
35 | + | ||
36 | +#define NO_DEST temp_arg(NULL) | ||
37 | + | ||
38 | static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2) | ||
39 | { | ||
40 | TCGArg a1 = *p1, a2 = *p2; | ||
41 | @@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op) | ||
42 | return false; | ||
43 | } | ||
44 | |||
45 | +static bool fold_const2_commutative(OptContext *ctx, TCGOp *op) | ||
44 | +{ | 46 | +{ |
45 | + tcg_out32(s, insn | (q << 6) | encode_vd(rd) | encode_vm(rm) | | 47 | + swap_commutative(op->args[0], &op->args[1], &op->args[2]); |
46 | + (extract32(l_imm6, 6, 1) << 7) | | 48 | + return fold_const2(ctx, op); |
47 | + (extract32(l_imm6, 0, 6) << 16)); | ||
48 | +} | 49 | +} |
49 | + | 50 | + |
50 | static void tcg_out_vldst(TCGContext *s, ARMInsn insn, | 51 | static bool fold_masks(OptContext *ctx, TCGOp *op) |
51 | TCGReg rd, TCGReg rn, int offset) | 52 | { |
52 | { | 53 | uint64_t a_mask = ctx->a_mask; |
53 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | 54 | @@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op) |
54 | case INDEX_op_abs_vec: | 55 | |
55 | case INDEX_op_neg_vec: | 56 | static bool fold_add(OptContext *ctx, TCGOp *op) |
56 | case INDEX_op_not_vec: | 57 | { |
57 | + case INDEX_op_shli_vec: | 58 | - if (fold_const2(ctx, op) || |
58 | + case INDEX_op_shri_vec: | 59 | + if (fold_const2_commutative(ctx, op) || |
59 | + case INDEX_op_sari_vec: | 60 | fold_xi_to_x(ctx, op, 0)) { |
60 | return C_O1_I1(w, w); | 61 | return true; |
61 | case INDEX_op_dup2_vec: | 62 | } |
62 | case INDEX_op_add_vec: | 63 | @@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add) |
63 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | 64 | |
64 | case INDEX_op_xor_vec: | 65 | static bool fold_add2(OptContext *ctx, TCGOp *op) |
65 | tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); | 66 | { |
66 | return; | 67 | + /* Note that the high and low parts may be independently swapped. */ |
67 | + case INDEX_op_shli_vec: | 68 | + swap_commutative(op->args[0], &op->args[2], &op->args[4]); |
68 | + tcg_out_vshifti(s, INSN_VSHLI, q, a0, a1, a2 + (8 << vece)); | 69 | + swap_commutative(op->args[1], &op->args[3], &op->args[5]); |
69 | + return; | 70 | + |
70 | + case INDEX_op_shri_vec: | 71 | return fold_addsub2(ctx, op, true); |
71 | + tcg_out_vshifti(s, INSN_VSHRI, q, a0, a1, (16 << vece) - a2); | 72 | } |
72 | + return; | 73 | |
73 | + case INDEX_op_sari_vec: | 74 | @@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op) |
74 | + tcg_out_vshifti(s, INSN_VSARI, q, a0, a1, (16 << vece) - a2); | 75 | { |
75 | + return; | 76 | uint64_t z1, z2; |
76 | 77 | ||
77 | case INDEX_op_andc_vec: | 78 | - if (fold_const2(ctx, op) || |
78 | if (!const_args[2]) { | 79 | + if (fold_const2_commutative(ctx, op) || |
79 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | 80 | fold_xi_to_i(ctx, op, 0) || |
80 | case INDEX_op_orc_vec: | 81 | fold_xi_to_x(ctx, op, -1) || |
81 | case INDEX_op_xor_vec: | 82 | fold_xx_to_x(ctx, op)) { |
82 | case INDEX_op_not_vec: | 83 | @@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op) |
83 | + case INDEX_op_shli_vec: | 84 | static bool fold_brcond(OptContext *ctx, TCGOp *op) |
84 | + case INDEX_op_shri_vec: | 85 | { |
85 | + case INDEX_op_sari_vec: | 86 | TCGCond cond = op->args[2]; |
86 | return 1; | 87 | - int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond); |
87 | case INDEX_op_abs_vec: | 88 | + int i; |
88 | case INDEX_op_cmp_vec: | 89 | |
90 | + if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) { | ||
91 | + op->args[2] = cond = tcg_swap_cond(cond); | ||
92 | + } | ||
93 | + | ||
94 | + i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond); | ||
95 | if (i == 0) { | ||
96 | tcg_op_remove(ctx->tcg, op); | ||
97 | return true; | ||
98 | @@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op) | ||
99 | static bool fold_brcond2(OptContext *ctx, TCGOp *op) | ||
100 | { | ||
101 | TCGCond cond = op->args[4]; | ||
102 | - int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond); | ||
103 | TCGArg label = op->args[5]; | ||
104 | - int inv = 0; | ||
105 | + int i, inv = 0; | ||
106 | |||
107 | + if (swap_commutative2(&op->args[0], &op->args[2])) { | ||
108 | + op->args[4] = cond = tcg_swap_cond(cond); | ||
109 | + } | ||
110 | + | ||
111 | + i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond); | ||
112 | if (i >= 0) { | ||
113 | goto do_brcond_const; | ||
114 | } | ||
115 | @@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op) | ||
116 | |||
117 | static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
118 | { | ||
119 | - if (fold_const2(ctx, op) || | ||
120 | + if (fold_const2_commutative(ctx, op) || | ||
121 | fold_xi_to_x(ctx, op, -1) || | ||
122 | fold_xi_to_not(ctx, op, 0)) { | ||
123 | return true; | ||
124 | @@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op) | ||
125 | static bool fold_movcond(OptContext *ctx, TCGOp *op) | ||
126 | { | ||
127 | TCGCond cond = op->args[5]; | ||
128 | - int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond); | ||
129 | + int i; | ||
130 | |||
131 | + if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) { | ||
132 | + op->args[5] = cond = tcg_swap_cond(cond); | ||
133 | + } | ||
134 | + /* | ||
135 | + * Canonicalize the "false" input reg to match the destination reg so | ||
136 | + * that the tcg backend can implement a "move if true" operation. | ||
137 | + */ | ||
138 | + if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) { | ||
139 | + op->args[5] = cond = tcg_invert_cond(cond); | ||
140 | + } | ||
141 | + | ||
142 | + i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond); | ||
143 | if (i >= 0) { | ||
144 | return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]); | ||
145 | } | ||
146 | @@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op) | ||
147 | |||
148 | static bool fold_mul_highpart(OptContext *ctx, TCGOp *op) | ||
149 | { | ||
150 | - if (fold_const2(ctx, op) || | ||
151 | + if (fold_const2_commutative(ctx, op) || | ||
152 | fold_xi_to_i(ctx, op, 0)) { | ||
153 | return true; | ||
154 | } | ||
155 | @@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op) | ||
156 | |||
157 | static bool fold_multiply2(OptContext *ctx, TCGOp *op) | ||
158 | { | ||
159 | + swap_commutative(op->args[0], &op->args[2], &op->args[3]); | ||
160 | + | ||
161 | if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) { | ||
162 | uint64_t a = arg_info(op->args[2])->val; | ||
163 | uint64_t b = arg_info(op->args[3])->val; | ||
164 | @@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op) | ||
165 | |||
166 | static bool fold_nand(OptContext *ctx, TCGOp *op) | ||
167 | { | ||
168 | - if (fold_const2(ctx, op) || | ||
169 | + if (fold_const2_commutative(ctx, op) || | ||
170 | fold_xi_to_not(ctx, op, -1)) { | ||
171 | return true; | ||
172 | } | ||
173 | @@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op) | ||
174 | |||
175 | static bool fold_nor(OptContext *ctx, TCGOp *op) | ||
176 | { | ||
177 | - if (fold_const2(ctx, op) || | ||
178 | + if (fold_const2_commutative(ctx, op) || | ||
179 | fold_xi_to_not(ctx, op, 0)) { | ||
180 | return true; | ||
181 | } | ||
182 | @@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op) | ||
183 | |||
184 | static bool fold_or(OptContext *ctx, TCGOp *op) | ||
185 | { | ||
186 | - if (fold_const2(ctx, op) || | ||
187 | + if (fold_const2_commutative(ctx, op) || | ||
188 | fold_xi_to_x(ctx, op, 0) || | ||
189 | fold_xx_to_x(ctx, op)) { | ||
190 | return true; | ||
191 | @@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op) | ||
192 | static bool fold_setcond(OptContext *ctx, TCGOp *op) | ||
193 | { | ||
194 | TCGCond cond = op->args[3]; | ||
195 | - int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond); | ||
196 | + int i; | ||
197 | |||
198 | + if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) { | ||
199 | + op->args[3] = cond = tcg_swap_cond(cond); | ||
200 | + } | ||
201 | + | ||
202 | + i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond); | ||
203 | if (i >= 0) { | ||
204 | return tcg_opt_gen_movi(ctx, op, op->args[0], i); | ||
205 | } | ||
206 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op) | ||
207 | static bool fold_setcond2(OptContext *ctx, TCGOp *op) | ||
208 | { | ||
209 | TCGCond cond = op->args[5]; | ||
210 | - int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond); | ||
211 | - int inv = 0; | ||
212 | + int i, inv = 0; | ||
213 | |||
214 | + if (swap_commutative2(&op->args[1], &op->args[3])) { | ||
215 | + op->args[5] = cond = tcg_swap_cond(cond); | ||
216 | + } | ||
217 | + | ||
218 | + i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond); | ||
219 | if (i >= 0) { | ||
220 | goto do_setcond_const; | ||
221 | } | ||
222 | @@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op) | ||
223 | |||
224 | static bool fold_xor(OptContext *ctx, TCGOp *op) | ||
225 | { | ||
226 | - if (fold_const2(ctx, op) || | ||
227 | + if (fold_const2_commutative(ctx, op) || | ||
228 | fold_xx_to_i(ctx, op, 0) || | ||
229 | fold_xi_to_x(ctx, op, 0) || | ||
230 | fold_xi_to_not(ctx, op, -1)) { | ||
231 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
232 | ctx.type = TCG_TYPE_I32; | ||
233 | } | ||
234 | |||
235 | - /* For commutative operations make constant second argument */ | ||
236 | - switch (opc) { | ||
237 | - CASE_OP_32_64_VEC(add): | ||
238 | - CASE_OP_32_64_VEC(mul): | ||
239 | - CASE_OP_32_64_VEC(and): | ||
240 | - CASE_OP_32_64_VEC(or): | ||
241 | - CASE_OP_32_64_VEC(xor): | ||
242 | - CASE_OP_32_64(eqv): | ||
243 | - CASE_OP_32_64(nand): | ||
244 | - CASE_OP_32_64(nor): | ||
245 | - CASE_OP_32_64(muluh): | ||
246 | - CASE_OP_32_64(mulsh): | ||
247 | - swap_commutative(op->args[0], &op->args[1], &op->args[2]); | ||
248 | - break; | ||
249 | - CASE_OP_32_64(brcond): | ||
250 | - if (swap_commutative(-1, &op->args[0], &op->args[1])) { | ||
251 | - op->args[2] = tcg_swap_cond(op->args[2]); | ||
252 | - } | ||
253 | - break; | ||
254 | - CASE_OP_32_64(setcond): | ||
255 | - if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) { | ||
256 | - op->args[3] = tcg_swap_cond(op->args[3]); | ||
257 | - } | ||
258 | - break; | ||
259 | - CASE_OP_32_64(movcond): | ||
260 | - if (swap_commutative(-1, &op->args[1], &op->args[2])) { | ||
261 | - op->args[5] = tcg_swap_cond(op->args[5]); | ||
262 | - } | ||
263 | - /* For movcond, we canonicalize the "false" input reg to match | ||
264 | - the destination reg so that the tcg backend can implement | ||
265 | - a "move if true" operation. */ | ||
266 | - if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) { | ||
267 | - op->args[5] = tcg_invert_cond(op->args[5]); | ||
268 | - } | ||
269 | - break; | ||
270 | - CASE_OP_32_64(add2): | ||
271 | - swap_commutative(op->args[0], &op->args[2], &op->args[4]); | ||
272 | - swap_commutative(op->args[1], &op->args[3], &op->args[5]); | ||
273 | - break; | ||
274 | - CASE_OP_32_64(mulu2): | ||
275 | - CASE_OP_32_64(muls2): | ||
276 | - swap_commutative(op->args[0], &op->args[2], &op->args[3]); | ||
277 | - break; | ||
278 | - case INDEX_op_brcond2_i32: | ||
279 | - if (swap_commutative2(&op->args[0], &op->args[2])) { | ||
280 | - op->args[4] = tcg_swap_cond(op->args[4]); | ||
281 | - } | ||
282 | - break; | ||
283 | - case INDEX_op_setcond2_i32: | ||
284 | - if (swap_commutative2(&op->args[1], &op->args[3])) { | ||
285 | - op->args[5] = tcg_swap_cond(op->args[5]); | ||
286 | - } | ||
287 | - break; | ||
288 | - default: | ||
289 | - break; | ||
290 | - } | ||
291 | - | ||
292 | /* Assume all bits affected, and no bits known zero. */ | ||
293 | ctx.a_mask = -1; | ||
294 | ctx.z_mask = -1; | ||
89 | -- | 295 | -- |
90 | 2.25.1 | 296 | 2.25.1 |
91 | 297 | ||
92 | 298 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | This "garbage" setting pre-dates the addition of the type | ||
2 | changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64, | ||
3 | and INDEX_op_extr{l,h}_i64_i32. | ||
1 | 4 | ||
5 | So now we have definitive points at which to adjust z_mask | ||
6 | to eliminate such bits from the 32-bit operands. | ||
7 | |||
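Concretely, "sign-extended" means a 32-bit constant or mask now occupies its 64-bit slot as (int32_t)val, which the tcg_opt_gen_movi hunk below enforces. A quick plain-C illustration:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t val = 0x80000000;                /* 32-bit value, sign bit set */
        uint64_t stored = (uint64_t)(int32_t)val; /* canonical representation */

        printf("%016" PRIx64 "\n", stored);       /* ffffffff80000000 */
        return 0;
    }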
8 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
9 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
10 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
11 | --- | ||
12 | tcg/optimize.c | 35 ++++++++++++++++------------------- | ||
13 | 1 file changed, 16 insertions(+), 19 deletions(-) | ||
14 | |||
15 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
16 | index XXXXXXX..XXXXXXX 100644 | ||
17 | --- a/tcg/optimize.c | ||
18 | +++ b/tcg/optimize.c | ||
19 | @@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts) | ||
20 | ti->is_const = true; | ||
21 | ti->val = ts->val; | ||
22 | ti->z_mask = ts->val; | ||
23 | - if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) { | ||
24 | - /* High bits of a 32-bit quantity are garbage. */ | ||
25 | - ti->z_mask |= ~0xffffffffull; | ||
26 | - } | ||
27 | } else { | ||
28 | ti->is_const = false; | ||
29 | ti->z_mask = -1; | ||
30 | @@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
31 | TCGTemp *src_ts = arg_temp(src); | ||
32 | TempOptInfo *di; | ||
33 | TempOptInfo *si; | ||
34 | - uint64_t z_mask; | ||
35 | TCGOpcode new_op; | ||
36 | |||
37 | if (ts_are_copies(dst_ts, src_ts)) { | ||
38 | @@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
39 | op->args[0] = dst; | ||
40 | op->args[1] = src; | ||
41 | |||
42 | - z_mask = si->z_mask; | ||
43 | - if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) { | ||
44 | - /* High bits of the destination are now garbage. */ | ||
45 | - z_mask |= ~0xffffffffull; | ||
46 | - } | ||
47 | - di->z_mask = z_mask; | ||
48 | + di->z_mask = si->z_mask; | ||
49 | |||
50 | if (src_ts->type == dst_ts->type) { | ||
51 | TempOptInfo *ni = ts_info(si->next_copy); | ||
52 | @@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) | ||
53 | static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op, | ||
54 | TCGArg dst, uint64_t val) | ||
55 | { | ||
56 | - /* Convert movi to mov with constant temp. */ | ||
57 | - TCGTemp *tv = tcg_constant_internal(ctx->type, val); | ||
58 | + TCGTemp *tv; | ||
59 | |||
60 | + if (ctx->type == TCG_TYPE_I32) { | ||
61 | + val = (int32_t)val; | ||
62 | + } | ||
63 | + | ||
64 | + /* Convert movi to mov with constant temp. */ | ||
65 | + tv = tcg_constant_internal(ctx->type, val); | ||
66 | init_ts_info(ctx, tv); | ||
67 | return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv)); | ||
68 | } | ||
69 | @@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op) | ||
70 | uint64_t z_mask = ctx->z_mask; | ||
71 | |||
72 | /* | ||
73 | - * 32-bit ops generate 32-bit results. For the "result is zero" test | ||
74 | - * below, we can ignore high bits, but for further optimizations we | ||
75 | - * need to record that the high bits contain garbage. | ||
76 | + * 32-bit ops generate 32-bit results, which for the purpose of | ||
77 | + * simplifying tcg are sign-extended. Certainly that's how we | ||
78 | + * represent our constants elsewhere. Note that the bits will | ||
79 | + * be reset properly for a 64-bit value when encountering the | ||
80 | + * type changing opcodes. | ||
81 | */ | ||
82 | if (ctx->type == TCG_TYPE_I32) { | ||
83 | - ctx->z_mask |= MAKE_64BIT_MASK(32, 32); | ||
84 | - a_mask &= MAKE_64BIT_MASK(0, 32); | ||
85 | - z_mask &= MAKE_64BIT_MASK(0, 32); | ||
86 | + a_mask = (int32_t)a_mask; | ||
87 | + z_mask = (int32_t)z_mask; | ||
88 | + ctx->z_mask = z_mask; | ||
89 | } | ||
90 | |||
91 | if (z_mask == 0) { | ||
92 | -- | ||
93 | 2.25.1 | ||
94 | |||
95 | diff view generated by jsdifflib |
New patch | |||
---|---|---|---|
1 | Recognize the constant function for or-complement. | ||
1 | 2 | ||
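The constant function in question is orc(x, x) = x | ~x = -1 for every x, which the added fold_xx_to_i(ctx, op, -1) line recognizes. A trivial check, illustrative only:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t xs[] = { 0, 1, 0xdeadbeef, UINT64_MAX };

        for (int i = 0; i < 4; i++) {
            assert((xs[i] | ~xs[i]) == UINT64_MAX); /* orc(x, x) == -1 */
        }
        return 0;
    }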
3 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
7 | --- | ||
8 | tcg/optimize.c | 1 + | ||
9 | 1 file changed, 1 insertion(+) | ||
10 | |||
11 | diff --git a/tcg/optimize.c b/tcg/optimize.c | ||
12 | index XXXXXXX..XXXXXXX 100644 | ||
13 | --- a/tcg/optimize.c | ||
14 | +++ b/tcg/optimize.c | ||
15 | @@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op) | ||
16 | static bool fold_orc(OptContext *ctx, TCGOp *op) | ||
17 | { | ||
18 | if (fold_const2(ctx, op) || | ||
19 | + fold_xx_to_i(ctx, op, -1) || | ||
20 | fold_xi_to_x(ctx, op, -1) || | ||
21 | fold_ix_to_not(ctx, op, 0)) { | ||
22 | return true; | ||
23 | -- | ||
24 | 2.25.1 | ||
25 | |||
26 | diff view generated by jsdifflib |
1 | This is minimum and maximum, signed and unsigned. | 1 | Recognize the identity function for low-part multiply. |
---|---|---|---|
2 | 2 | ||
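The identity is mul(x, 1) = x, so routing the multiply through fold_xi_to_x(ctx, op, 1) lets the existing helper replace it with a plain mov. A trivial check, illustrative only:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t xs[] = { 0, 7, UINT64_MAX };

        for (int i = 0; i < 3; i++) {
            assert(xs[i] * UINT64_C(1) == xs[i]); /* mul(x, 1) == x */
        }
        return 0;
    }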
3 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 3 | Suggested-by: Luis Pires <luis.pires@eldorado.org.br> |
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
5 | --- | 7 | --- |
6 | tcg/arm/tcg-target.h | 2 +- | 8 | tcg/optimize.c | 3 ++- |
7 | tcg/arm/tcg-target.c.inc | 24 ++++++++++++++++++++++++ | 9 | 1 file changed, 2 insertions(+), 1 deletion(-) |
8 | 2 files changed, 25 insertions(+), 1 deletion(-) | ||
9 | 10 | ||
10 | diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h | 11 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
11 | index XXXXXXX..XXXXXXX 100644 | 12 | index XXXXXXX..XXXXXXX 100644 |
12 | --- a/tcg/arm/tcg-target.h | 13 | --- a/tcg/optimize.c |
13 | +++ b/tcg/arm/tcg-target.h | 14 | +++ b/tcg/optimize.c |
14 | @@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions; | 15 | @@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op) |
15 | #define TCG_TARGET_HAS_shv_vec 0 | 16 | static bool fold_mul(OptContext *ctx, TCGOp *op) |
16 | #define TCG_TARGET_HAS_mul_vec 1 | 17 | { |
17 | #define TCG_TARGET_HAS_sat_vec 1 | 18 | if (fold_const2(ctx, op) || |
18 | -#define TCG_TARGET_HAS_minmax_vec 0 | 19 | - fold_xi_to_i(ctx, op, 0)) { |
19 | +#define TCG_TARGET_HAS_minmax_vec 1 | 20 | + fold_xi_to_i(ctx, op, 0) || |
20 | #define TCG_TARGET_HAS_bitsel_vec 0 | 21 | + fold_xi_to_x(ctx, op, 1)) { |
21 | #define TCG_TARGET_HAS_cmpsel_vec 0 | 22 | return true; |
22 | 23 | } | |
23 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 24 | return false; |
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/tcg/arm/tcg-target.c.inc | ||
26 | +++ b/tcg/arm/tcg-target.c.inc | ||
27 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
28 | INSN_VQADD_U = 0xf3000010, | ||
29 | INSN_VQSUB = 0xf2000210, | ||
30 | INSN_VQSUB_U = 0xf3000210, | ||
31 | + INSN_VMAX = 0xf2000600, | ||
32 | + INSN_VMAX_U = 0xf3000600, | ||
33 | + INSN_VMIN = 0xf2000610, | ||
34 | + INSN_VMIN_U = 0xf3000610, | ||
35 | |||
36 | INSN_VABS = 0xf3b10300, | ||
37 | INSN_VMVN = 0xf3b00580, | ||
38 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | ||
39 | case INDEX_op_dup2_vec: | ||
40 | case INDEX_op_add_vec: | ||
41 | case INDEX_op_mul_vec: | ||
42 | + case INDEX_op_smax_vec: | ||
43 | + case INDEX_op_smin_vec: | ||
44 | case INDEX_op_ssadd_vec: | ||
45 | case INDEX_op_sssub_vec: | ||
46 | case INDEX_op_sub_vec: | ||
47 | + case INDEX_op_umax_vec: | ||
48 | + case INDEX_op_umin_vec: | ||
49 | case INDEX_op_usadd_vec: | ||
50 | case INDEX_op_ussub_vec: | ||
51 | case INDEX_op_xor_vec: | ||
52 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
53 | case INDEX_op_mul_vec: | ||
54 | tcg_out_vreg3(s, INSN_VMUL, q, vece, a0, a1, a2); | ||
55 | return; | ||
56 | + case INDEX_op_smax_vec: | ||
57 | + tcg_out_vreg3(s, INSN_VMAX, q, vece, a0, a1, a2); | ||
58 | + return; | ||
59 | + case INDEX_op_smin_vec: | ||
60 | + tcg_out_vreg3(s, INSN_VMIN, q, vece, a0, a1, a2); | ||
61 | + return; | ||
62 | case INDEX_op_sub_vec: | ||
63 | tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2); | ||
64 | return; | ||
65 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
66 | case INDEX_op_sssub_vec: | ||
67 | tcg_out_vreg3(s, INSN_VQSUB, q, vece, a0, a1, a2); | ||
68 | return; | ||
69 | + case INDEX_op_umax_vec: | ||
70 | + tcg_out_vreg3(s, INSN_VMAX_U, q, vece, a0, a1, a2); | ||
71 | + return; | ||
72 | + case INDEX_op_umin_vec: | ||
73 | + tcg_out_vreg3(s, INSN_VMIN_U, q, vece, a0, a1, a2); | ||
74 | + return; | ||
75 | case INDEX_op_usadd_vec: | ||
76 | tcg_out_vreg3(s, INSN_VQADD_U, q, vece, a0, a1, a2); | ||
77 | return; | ||
78 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | ||
79 | case INDEX_op_cmp_vec: | ||
80 | case INDEX_op_mul_vec: | ||
81 | case INDEX_op_neg_vec: | ||
82 | + case INDEX_op_smax_vec: | ||
83 | + case INDEX_op_smin_vec: | ||
84 | + case INDEX_op_umax_vec: | ||
85 | + case INDEX_op_umin_vec: | ||
86 | return vece < MO_64; | ||
87 | default: | ||
88 | return 0; | ||
89 | -- | 25 | -- |
90 | 2.25.1 | 26 | 2.25.1 |
91 | 27 | ||
92 | 28 | diff view generated by jsdifflib |
1 | This is saturating add and subtract, signed and unsigned. | 1 | Recognize the identity function for division. |
---|---|---|---|
2 | 2 | ||
3 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 3 | Suggested-by: Luis Pires <luis.pires@eldorado.org.br> |
4 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
5 | --- | 7 | --- |
6 | tcg/arm/tcg-target.h | 2 +- | 8 | tcg/optimize.c | 6 +++++- |
7 | tcg/arm/tcg-target.c.inc | 24 ++++++++++++++++++++++++ | 9 | 1 file changed, 5 insertions(+), 1 deletion(-) |
8 | 2 files changed, 25 insertions(+), 1 deletion(-) | ||
9 | 10 | ||
10 | diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h | 11 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
11 | index XXXXXXX..XXXXXXX 100644 | 12 | index XXXXXXX..XXXXXXX 100644 |
12 | --- a/tcg/arm/tcg-target.h | 13 | --- a/tcg/optimize.c |
13 | +++ b/tcg/arm/tcg-target.h | 14 | +++ b/tcg/optimize.c |
14 | @@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions; | 15 | @@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op) |
15 | #define TCG_TARGET_HAS_shs_vec 0 | 16 | |
16 | #define TCG_TARGET_HAS_shv_vec 0 | 17 | static bool fold_divide(OptContext *ctx, TCGOp *op) |
17 | #define TCG_TARGET_HAS_mul_vec 1 | 18 | { |
18 | -#define TCG_TARGET_HAS_sat_vec 0 | 19 | - return fold_const2(ctx, op); |
19 | +#define TCG_TARGET_HAS_sat_vec 1 | 20 | + if (fold_const2(ctx, op) || |
20 | #define TCG_TARGET_HAS_minmax_vec 0 | 21 | + fold_xi_to_x(ctx, op, 1)) { |
21 | #define TCG_TARGET_HAS_bitsel_vec 0 | 22 | + return true; |
22 | #define TCG_TARGET_HAS_cmpsel_vec 0 | 23 | + } |
23 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 24 | + return false; |
24 | index XXXXXXX..XXXXXXX 100644 | 25 | } |
25 | --- a/tcg/arm/tcg-target.c.inc | 26 | |
26 | +++ b/tcg/arm/tcg-target.c.inc | 27 | static bool fold_dup(OptContext *ctx, TCGOp *op) |
27 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
28 | INSN_VORR = 0xf2200110, | ||
29 | INSN_VSUB = 0xf3000800, | ||
30 | INSN_VMUL = 0xf2000910, | ||
31 | + INSN_VQADD = 0xf2000010, | ||
32 | + INSN_VQADD_U = 0xf3000010, | ||
33 | + INSN_VQSUB = 0xf2000210, | ||
34 | + INSN_VQSUB_U = 0xf3000210, | ||
35 | |||
36 | INSN_VABS = 0xf3b10300, | ||
37 | INSN_VMVN = 0xf3b00580, | ||
38 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | ||
39 | case INDEX_op_dup2_vec: | ||
40 | case INDEX_op_add_vec: | ||
41 | case INDEX_op_mul_vec: | ||
42 | + case INDEX_op_ssadd_vec: | ||
43 | + case INDEX_op_sssub_vec: | ||
44 | case INDEX_op_sub_vec: | ||
45 | + case INDEX_op_usadd_vec: | ||
46 | + case INDEX_op_ussub_vec: | ||
47 | case INDEX_op_xor_vec: | ||
48 | return C_O1_I2(w, w, w); | ||
49 | case INDEX_op_or_vec: | ||
50 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
51 | case INDEX_op_sub_vec: | ||
52 | tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2); | ||
53 | return; | ||
54 | + case INDEX_op_ssadd_vec: | ||
55 | + tcg_out_vreg3(s, INSN_VQADD, q, vece, a0, a1, a2); | ||
56 | + return; | ||
57 | + case INDEX_op_sssub_vec: | ||
58 | + tcg_out_vreg3(s, INSN_VQSUB, q, vece, a0, a1, a2); | ||
59 | + return; | ||
60 | + case INDEX_op_usadd_vec: | ||
61 | + tcg_out_vreg3(s, INSN_VQADD_U, q, vece, a0, a1, a2); | ||
62 | + return; | ||
63 | + case INDEX_op_ussub_vec: | ||
64 | + tcg_out_vreg3(s, INSN_VQSUB_U, q, vece, a0, a1, a2); | ||
65 | + return; | ||
66 | case INDEX_op_xor_vec: | ||
67 | tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); | ||
68 | return; | ||
69 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | ||
70 | case INDEX_op_shli_vec: | ||
71 | case INDEX_op_shri_vec: | ||
72 | case INDEX_op_sari_vec: | ||
73 | + case INDEX_op_ssadd_vec: | ||
74 | + case INDEX_op_sssub_vec: | ||
75 | + case INDEX_op_usadd_vec: | ||
76 | + case INDEX_op_ussub_vec: | ||
77 | return 1; | ||
78 | case INDEX_op_abs_vec: | ||
79 | case INDEX_op_cmp_vec: | ||
80 | -- | 28 | -- |
81 | 2.25.1 | 29 | 2.25.1 |
82 | 30 | ||
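A scalar model of what one VQADD lane computes, clamping instead of wrapping (an illustrative sketch, not the NEON encoding):

    #include <assert.h>
    #include <stdint.h>

    /* Signed saturating add on an 8-bit lane. */
    static int8_t ssadd8(int8_t a, int8_t b)
    {
        int r = a + b;
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        return (int8_t)r;
    }

    int main(void)
    {
        assert(ssadd8(100, 100) == 127);    /* clamps where 200 would wrap */
        assert(ssadd8(-100, -100) == -128);
        assert(ssadd8(3, 4) == 7);          /* in range: plain add */
        return 0;
    }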
1 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 1 | Recognize the constant function for remainder. |
---|---|---|---|
2 | |||
3 | Suggested-by: Luis Pires <luis.pires@eldorado.org.br> | ||
4 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
2 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
3 | --- | 6 | --- |
4 | tcg/arm/tcg-target.h | 2 +- | 7 | tcg/optimize.c | 6 +++++- |
5 | tcg/arm/tcg-target.c.inc | 6 ++++++ | 8 | 1 file changed, 5 insertions(+), 1 deletion(-) |
6 | 2 files changed, 7 insertions(+), 1 deletion(-) | ||
7 | 9 | ||
8 | diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h | 10 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
9 | index XXXXXXX..XXXXXXX 100644 | 11 | index XXXXXXX..XXXXXXX 100644 |
10 | --- a/tcg/arm/tcg-target.h | 12 | --- a/tcg/optimize.c |
11 | +++ b/tcg/arm/tcg-target.h | 13 | +++ b/tcg/optimize.c |
12 | @@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions; | 14 | @@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op) |
13 | #define TCG_TARGET_HAS_shi_vec 1 | 15 | |
14 | #define TCG_TARGET_HAS_shs_vec 0 | 16 | static bool fold_remainder(OptContext *ctx, TCGOp *op) |
15 | #define TCG_TARGET_HAS_shv_vec 0 | 17 | { |
16 | -#define TCG_TARGET_HAS_mul_vec 0 | 18 | - return fold_const2(ctx, op); |
17 | +#define TCG_TARGET_HAS_mul_vec 1 | 19 | + if (fold_const2(ctx, op) || |
18 | #define TCG_TARGET_HAS_sat_vec 0 | 20 | + fold_xx_to_i(ctx, op, 0)) { |
19 | #define TCG_TARGET_HAS_minmax_vec 0 | 21 | + return true; |
20 | #define TCG_TARGET_HAS_bitsel_vec 0 | 22 | + } |
21 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 23 | + return false; |
22 | index XXXXXXX..XXXXXXX 100644 | 24 | } |
23 | --- a/tcg/arm/tcg-target.c.inc | 25 | |
24 | +++ b/tcg/arm/tcg-target.c.inc | 26 | static bool fold_setcond(OptContext *ctx, TCGOp *op) |
25 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
26 | INSN_VORN = 0xf2300110, | ||
27 | INSN_VORR = 0xf2200110, | ||
28 | INSN_VSUB = 0xf3000800, | ||
29 | + INSN_VMUL = 0xf2000910, | ||
30 | |||
31 | INSN_VABS = 0xf3b10300, | ||
32 | INSN_VMVN = 0xf3b00580, | ||
33 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | ||
34 | return C_O1_I1(w, w); | ||
35 | case INDEX_op_dup2_vec: | ||
36 | case INDEX_op_add_vec: | ||
37 | + case INDEX_op_mul_vec: | ||
38 | case INDEX_op_sub_vec: | ||
39 | case INDEX_op_xor_vec: | ||
40 | return C_O1_I2(w, w, w); | ||
41 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
42 | case INDEX_op_add_vec: | ||
43 | tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2); | ||
44 | return; | ||
45 | + case INDEX_op_mul_vec: | ||
46 | + tcg_out_vreg3(s, INSN_VMUL, q, vece, a0, a1, a2); | ||
47 | + return; | ||
48 | case INDEX_op_sub_vec: | ||
49 | tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2); | ||
50 | return; | ||
51 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | ||
52 | return 1; | ||
53 | case INDEX_op_abs_vec: | ||
54 | case INDEX_op_cmp_vec: | ||
55 | + case INDEX_op_mul_vec: | ||
56 | case INDEX_op_neg_vec: | ||
57 | return vece < MO_64; | ||
58 | default: | ||
59 | -- | 27 | -- |
60 | 2.25.1 | 28 | 2.25.1 |
61 | 29 | ||
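The remainder fold uses x % x == 0 for nonzero x; the x == 0 case is division by zero, which TCG leaves undefined, so folding it to 0 as well is safe. A standalone check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* fold_xx_to_i(ctx, op, 0): x % x is 0 for every nonzero x. */
        for (int64_t x = 1; x < 1000; x++) {
            assert(x % x == 0);
            assert(-x % -x == 0);
        }
        return 0;
    }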
1 | Implement via expansion, so don't actually set TCG_TARGET_HAS_roti_vec. | 1 | Certain targets, like riscv, produce signed 32-bit results. |
---|---|---|---|
2 | For NEON, this is shift-right followed by shift-left-and-insert. | 2 | This can lead to lots of redundant extensions as values are |
3 | 3 | manipulated. | |
4 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 4 | |
5 | Begin by tracking only the obvious sign-extensions, and | ||
6 | converting them to simple copies when possible. | ||
7 | |||
8 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
9 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 10 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
6 | --- | 11 | --- |
7 | tcg/arm/tcg-target-con-set.h | 1 + | 12 | tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++--------- |
8 | tcg/arm/tcg-target.opc.h | 1 + | 13 | 1 file changed, 102 insertions(+), 21 deletions(-) |
9 | tcg/arm/tcg-target.c.inc | 15 +++++++++++++++ | 14 | |
10 | 3 files changed, 17 insertions(+) | 15 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
11 | |||
12 | diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h | ||
13 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/tcg/arm/tcg-target-con-set.h | 17 | --- a/tcg/optimize.c |
15 | +++ b/tcg/arm/tcg-target-con-set.h | 18 | +++ b/tcg/optimize.c |
16 | @@ -XXX,XX +XXX,XX @@ C_O1_I2(r, r, rIK) | 19 | @@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo { |
17 | C_O1_I2(r, r, rIN) | 20 | TCGTemp *next_copy; |
18 | C_O1_I2(r, r, ri) | 21 | uint64_t val; |
19 | C_O1_I2(r, rZ, rZ) | 22 | uint64_t z_mask; /* mask bit is 0 if and only if value bit is 0 */ |
20 | +C_O1_I2(w, 0, w) | 23 | + uint64_t s_mask; /* a left-aligned mask of clrsb(value) bits. */ |
21 | C_O1_I2(w, w, w) | 24 | } TempOptInfo; |
22 | C_O1_I2(w, w, wO) | 25 | |
23 | C_O1_I2(w, w, wV) | 26 | typedef struct OptContext { |
24 | diff --git a/tcg/arm/tcg-target.opc.h b/tcg/arm/tcg-target.opc.h | 27 | @@ -XXX,XX +XXX,XX @@ typedef struct OptContext { |
25 | index XXXXXXX..XXXXXXX 100644 | 28 | /* In flight values from optimization. */ |
26 | --- a/tcg/arm/tcg-target.opc.h | 29 | uint64_t a_mask; /* mask bit is 0 iff value identical to first input */ |
27 | +++ b/tcg/arm/tcg-target.opc.h | 30 | uint64_t z_mask; /* mask bit is 0 iff value bit is 0 */ |
28 | @@ -XXX,XX +XXX,XX @@ | 31 | + uint64_t s_mask; /* mask of clrsb(value) bits */ |
29 | * consider these to be UNSPEC with names. | 32 | TCGType type; |
30 | */ | 33 | } OptContext; |
31 | 34 | ||
32 | +DEF(arm_sli_vec, 1, 2, 1, IMPLVEC) | 35 | +/* Calculate the smask for a specific value. */ |
33 | DEF(arm_sshl_vec, 1, 2, 0, IMPLVEC) | 36 | +static uint64_t smask_from_value(uint64_t value) |
34 | DEF(arm_ushl_vec, 1, 2, 0, IMPLVEC) | 37 | +{ |
35 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 38 | + int rep = clrsb64(value); |
36 | index XXXXXXX..XXXXXXX 100644 | 39 | + return ~(~0ull >> rep); |
37 | --- a/tcg/arm/tcg-target.c.inc | 40 | +} |
38 | +++ b/tcg/arm/tcg-target.c.inc | 41 | + |
39 | @@ -XXX,XX +XXX,XX @@ typedef enum { | 42 | +/* |
40 | INSN_VSHLI = 0xf2800510, /* VSHL (immediate) */ | 43 | + * Calculate the smask for a given set of known-zeros. |
41 | INSN_VSARI = 0xf2800010, /* VSHR.S */ | 44 | + * If there are lots of zeros on the left, we can consider the remainder |
42 | INSN_VSHRI = 0xf3800010, /* VSHR.U */ | 45 | + * an unsigned field, and thus the corresponding signed field is one bit |
43 | + INSN_VSLI = 0xf3800510, | 46 | + * larger. |
44 | INSN_VSHL_S = 0xf2000400, /* VSHL.S (register) */ | 47 | + */ |
45 | INSN_VSHL_U = 0xf3000400, /* VSHL.U (register) */ | 48 | +static uint64_t smask_from_zmask(uint64_t zmask) |
46 | 49 | +{ | |
47 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | 50 | + /* |
48 | case INDEX_op_arm_sshl_vec: | 51 | + * Only the 0 bits are significant for zmask, thus the msb itself |
49 | case INDEX_op_arm_ushl_vec: | 52 | + * must be zero, else we have no sign information. |
50 | return C_O1_I2(w, w, w); | 53 | + */ |
51 | + case INDEX_op_arm_sli_vec: | 54 | + int rep = clz64(zmask); |
52 | + return C_O1_I2(w, 0, w); | 55 | + if (rep == 0) { |
53 | case INDEX_op_or_vec: | 56 | + return 0; |
54 | case INDEX_op_andc_vec: | 57 | + } |
55 | return C_O1_I2(w, w, wO); | 58 | + rep -= 1; |
56 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | 59 | + return ~(~0ull >> rep); |
57 | case INDEX_op_sari_vec: | 60 | +} |
58 | tcg_out_vshifti(s, INSN_VSARI, q, a0, a1, (16 << vece) - a2); | 61 | + |
59 | return; | 62 | static inline TempOptInfo *ts_info(TCGTemp *ts) |
60 | + case INDEX_op_arm_sli_vec: | 63 | { |
61 | + tcg_out_vshifti(s, INSN_VSLI, q, a0, a2, args[3] + (8 << vece)); | 64 | return ts->state_ptr; |
62 | + return; | 65 | @@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts) |
63 | 66 | ti->prev_copy = ts; | |
64 | case INDEX_op_andc_vec: | 67 | ti->is_const = false; |
65 | if (!const_args[2]) { | 68 | ti->z_mask = -1; |
66 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | 69 | + ti->s_mask = 0; |
67 | case INDEX_op_shlv_vec: | 70 | } |
68 | case INDEX_op_shrv_vec: | 71 | |
69 | case INDEX_op_sarv_vec: | 72 | static void reset_temp(TCGArg arg) |
70 | + case INDEX_op_rotli_vec: | 73 | @@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts) |
71 | return -1; | 74 | ti->is_const = true; |
72 | default: | 75 | ti->val = ts->val; |
73 | return 0; | 76 | ti->z_mask = ts->val; |
74 | @@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, | 77 | + ti->s_mask = smask_from_value(ts->val); |
75 | tcg_temp_free_vec(t1); | 78 | } else { |
76 | break; | 79 | ti->is_const = false; |
77 | 80 | ti->z_mask = -1; | |
78 | + case INDEX_op_rotli_vec: | 81 | + ti->s_mask = 0; |
79 | + t1 = tcg_temp_new_vec(type); | 82 | } |
80 | + tcg_gen_shri_vec(vece, t1, v1, -a2 & ((8 << vece) - 1)); | 83 | } |
81 | + vec_gen_4(INDEX_op_arm_sli_vec, type, vece, | 84 | |
82 | + tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(v1), a2); | 85 | @@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src) |
83 | + tcg_temp_free_vec(t1); | 86 | op->args[1] = src; |
84 | + break; | 87 | |
85 | + | 88 | di->z_mask = si->z_mask; |
89 | + di->s_mask = si->s_mask; | ||
90 | |||
91 | if (src_ts->type == dst_ts->type) { | ||
92 | TempOptInfo *ni = ts_info(si->next_copy); | ||
93 | @@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op) | ||
94 | |||
95 | nb_oargs = def->nb_oargs; | ||
96 | for (i = 0; i < nb_oargs; i++) { | ||
97 | - reset_temp(op->args[i]); | ||
98 | + TCGTemp *ts = arg_temp(op->args[i]); | ||
99 | + reset_ts(ts); | ||
100 | /* | ||
101 | - * Save the corresponding known-zero bits mask for the | ||
102 | + * Save the corresponding known-zero/sign bits mask for the | ||
103 | * first output argument (only one supported so far). | ||
104 | */ | ||
105 | if (i == 0) { | ||
106 | - arg_info(op->args[i])->z_mask = ctx->z_mask; | ||
107 | + ts_info(ts)->z_mask = ctx->z_mask; | ||
108 | + ts_info(ts)->s_mask = ctx->s_mask; | ||
109 | } | ||
110 | } | ||
111 | } | ||
112 | @@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op) | ||
113 | { | ||
114 | uint64_t a_mask = ctx->a_mask; | ||
115 | uint64_t z_mask = ctx->z_mask; | ||
116 | + uint64_t s_mask = ctx->s_mask; | ||
117 | |||
118 | /* | ||
119 | * 32-bit ops generate 32-bit results, which for the purpose of | ||
120 | @@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op) | ||
121 | if (ctx->type == TCG_TYPE_I32) { | ||
122 | a_mask = (int32_t)a_mask; | ||
123 | z_mask = (int32_t)z_mask; | ||
124 | + s_mask |= MAKE_64BIT_MASK(32, 32); | ||
125 | ctx->z_mask = z_mask; | ||
126 | + ctx->s_mask = s_mask; | ||
127 | } | ||
128 | |||
129 | if (z_mask == 0) { | ||
130 | @@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op) | ||
131 | |||
132 | static bool fold_bswap(OptContext *ctx, TCGOp *op) | ||
133 | { | ||
134 | - uint64_t z_mask, sign; | ||
135 | + uint64_t z_mask, s_mask, sign; | ||
136 | |||
137 | if (arg_is_const(op->args[1])) { | ||
138 | uint64_t t = arg_info(op->args[1])->val; | ||
139 | @@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op) | ||
140 | } | ||
141 | |||
142 | z_mask = arg_info(op->args[1])->z_mask; | ||
143 | + | ||
144 | switch (op->opc) { | ||
145 | case INDEX_op_bswap16_i32: | ||
146 | case INDEX_op_bswap16_i64: | ||
147 | @@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op) | ||
86 | default: | 148 | default: |
87 | g_assert_not_reached(); | 149 | g_assert_not_reached(); |
88 | } | 150 | } |
151 | + s_mask = smask_from_zmask(z_mask); | ||
152 | |||
153 | switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) { | ||
154 | case TCG_BSWAP_OZ: | ||
155 | @@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op) | ||
156 | /* If the sign bit may be 1, force all the bits above to 1. */ | ||
157 | if (z_mask & sign) { | ||
158 | z_mask |= sign; | ||
159 | + s_mask = sign << 1; | ||
160 | } | ||
161 | break; | ||
162 | default: | ||
163 | /* The high bits are undefined: force all bits above the sign to 1. */ | ||
164 | z_mask |= sign << 1; | ||
165 | + s_mask = 0; | ||
166 | break; | ||
167 | } | ||
168 | ctx->z_mask = z_mask; | ||
169 | + ctx->s_mask = s_mask; | ||
170 | |||
171 | return fold_masks(ctx, op); | ||
172 | } | ||
173 | @@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op) | ||
174 | static bool fold_extract(OptContext *ctx, TCGOp *op) | ||
175 | { | ||
176 | uint64_t z_mask_old, z_mask; | ||
177 | + int pos = op->args[2]; | ||
178 | + int len = op->args[3]; | ||
179 | |||
180 | if (arg_is_const(op->args[1])) { | ||
181 | uint64_t t; | ||
182 | |||
183 | t = arg_info(op->args[1])->val; | ||
184 | - t = extract64(t, op->args[2], op->args[3]); | ||
185 | + t = extract64(t, pos, len); | ||
186 | return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
187 | } | ||
188 | |||
189 | z_mask_old = arg_info(op->args[1])->z_mask; | ||
190 | - z_mask = extract64(z_mask_old, op->args[2], op->args[3]); | ||
191 | - if (op->args[2] == 0) { | ||
192 | + z_mask = extract64(z_mask_old, pos, len); | ||
193 | + if (pos == 0) { | ||
194 | ctx->a_mask = z_mask_old ^ z_mask; | ||
195 | } | ||
196 | ctx->z_mask = z_mask; | ||
197 | + ctx->s_mask = smask_from_zmask(z_mask); | ||
198 | |||
199 | return fold_masks(ctx, op); | ||
200 | } | ||
201 | @@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op) | ||
202 | |||
203 | static bool fold_exts(OptContext *ctx, TCGOp *op) | ||
204 | { | ||
205 | - uint64_t z_mask_old, z_mask, sign; | ||
206 | + uint64_t s_mask_old, s_mask, z_mask, sign; | ||
207 | bool type_change = false; | ||
208 | |||
209 | if (fold_const1(ctx, op)) { | ||
210 | return true; | ||
211 | } | ||
212 | |||
213 | - z_mask_old = z_mask = arg_info(op->args[1])->z_mask; | ||
214 | + z_mask = arg_info(op->args[1])->z_mask; | ||
215 | + s_mask = arg_info(op->args[1])->s_mask; | ||
216 | + s_mask_old = s_mask; | ||
217 | |||
218 | switch (op->opc) { | ||
219 | CASE_OP_32_64(ext8s): | ||
220 | @@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op) | ||
221 | |||
222 | if (z_mask & sign) { | ||
223 | z_mask |= sign; | ||
224 | - } else if (!type_change) { | ||
225 | - ctx->a_mask = z_mask_old ^ z_mask; | ||
226 | } | ||
227 | + s_mask |= sign << 1; | ||
228 | + | ||
229 | ctx->z_mask = z_mask; | ||
230 | + ctx->s_mask = s_mask; | ||
231 | + if (!type_change) { | ||
232 | + ctx->a_mask = s_mask & ~s_mask_old; | ||
233 | + } | ||
234 | |||
235 | return fold_masks(ctx, op); | ||
236 | } | ||
237 | @@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op) | ||
238 | } | ||
239 | |||
240 | ctx->z_mask = z_mask; | ||
241 | + ctx->s_mask = smask_from_zmask(z_mask); | ||
242 | if (!type_change) { | ||
243 | ctx->a_mask = z_mask_old ^ z_mask; | ||
244 | } | ||
245 | @@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op) | ||
246 | MemOp mop = get_memop(oi); | ||
247 | int width = 8 * memop_size(mop); | ||
248 | |||
249 | - if (!(mop & MO_SIGN) && width < 64) { | ||
250 | - ctx->z_mask = MAKE_64BIT_MASK(0, width); | ||
251 | + if (width < 64) { | ||
252 | + ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width); | ||
253 | + if (!(mop & MO_SIGN)) { | ||
254 | + ctx->z_mask = MAKE_64BIT_MASK(0, width); | ||
255 | + ctx->s_mask <<= 1; | ||
256 | + } | ||
257 | } | ||
258 | |||
259 | /* Opcodes that touch guest memory stop the mb optimization. */ | ||
260 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op) | ||
261 | |||
262 | static bool fold_sextract(OptContext *ctx, TCGOp *op) | ||
263 | { | ||
264 | - int64_t z_mask_old, z_mask; | ||
265 | + uint64_t z_mask, s_mask, s_mask_old; | ||
266 | + int pos = op->args[2]; | ||
267 | + int len = op->args[3]; | ||
268 | |||
269 | if (arg_is_const(op->args[1])) { | ||
270 | uint64_t t; | ||
271 | |||
272 | t = arg_info(op->args[1])->val; | ||
273 | - t = sextract64(t, op->args[2], op->args[3]); | ||
274 | + t = sextract64(t, pos, len); | ||
275 | return tcg_opt_gen_movi(ctx, op, op->args[0], t); | ||
276 | } | ||
277 | |||
278 | - z_mask_old = arg_info(op->args[1])->z_mask; | ||
279 | - z_mask = sextract64(z_mask_old, op->args[2], op->args[3]); | ||
280 | - if (op->args[2] == 0 && z_mask >= 0) { | ||
281 | - ctx->a_mask = z_mask_old ^ z_mask; | ||
282 | - } | ||
283 | + z_mask = arg_info(op->args[1])->z_mask; | ||
284 | + z_mask = sextract64(z_mask, pos, len); | ||
285 | ctx->z_mask = z_mask; | ||
286 | |||
287 | + s_mask_old = arg_info(op->args[1])->s_mask; | ||
288 | + s_mask = sextract64(s_mask_old, pos, len); | ||
289 | + s_mask |= MAKE_64BIT_MASK(len, 64 - len); | ||
290 | + ctx->s_mask = s_mask; | ||
291 | + | ||
292 | + if (pos == 0) { | ||
293 | + ctx->a_mask = s_mask & ~s_mask_old; | ||
294 | + } | ||
295 | + | ||
296 | return fold_masks(ctx, op); | ||
297 | } | ||
298 | |||
299 | @@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op) | ||
300 | { | ||
301 | /* We can't do any folding with a load, but we can record bits. */ | ||
302 | switch (op->opc) { | ||
303 | + CASE_OP_32_64(ld8s): | ||
304 | + ctx->s_mask = MAKE_64BIT_MASK(8, 56); | ||
305 | + break; | ||
306 | CASE_OP_32_64(ld8u): | ||
307 | ctx->z_mask = MAKE_64BIT_MASK(0, 8); | ||
308 | + ctx->s_mask = MAKE_64BIT_MASK(9, 55); | ||
309 | + break; | ||
310 | + CASE_OP_32_64(ld16s): | ||
311 | + ctx->s_mask = MAKE_64BIT_MASK(16, 48); | ||
312 | break; | ||
313 | CASE_OP_32_64(ld16u): | ||
314 | ctx->z_mask = MAKE_64BIT_MASK(0, 16); | ||
315 | + ctx->s_mask = MAKE_64BIT_MASK(17, 47); | ||
316 | + break; | ||
317 | + case INDEX_op_ld32s_i64: | ||
318 | + ctx->s_mask = MAKE_64BIT_MASK(32, 32); | ||
319 | break; | ||
320 | case INDEX_op_ld32u_i64: | ||
321 | ctx->z_mask = MAKE_64BIT_MASK(0, 32); | ||
322 | + ctx->s_mask = MAKE_64BIT_MASK(33, 31); | ||
323 | break; | ||
324 | default: | ||
325 | g_assert_not_reached(); | ||
326 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
327 | ctx.type = TCG_TYPE_I32; | ||
328 | } | ||
329 | |||
330 | - /* Assume all bits affected, and no bits known zero. */ | ||
331 | + /* Assume all bits affected, no bits known zero, no sign reps. */ | ||
332 | ctx.a_mask = -1; | ||
333 | ctx.z_mask = -1; | ||
334 | + ctx.s_mask = 0; | ||
335 | |||
336 | /* | ||
337 | * Process each opcode. | ||
338 | @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s) | ||
339 | case INDEX_op_extrh_i64_i32: | ||
340 | done = fold_extu(&ctx, op); | ||
341 | break; | ||
342 | + CASE_OP_32_64(ld8s): | ||
343 | CASE_OP_32_64(ld8u): | ||
344 | + CASE_OP_32_64(ld16s): | ||
345 | CASE_OP_32_64(ld16u): | ||
346 | + case INDEX_op_ld32s_i64: | ||
347 | case INDEX_op_ld32u_i64: | ||
348 | done = fold_tcg_ld(&ctx, op); | ||
349 | break; | ||
89 | -- | 350 | -- |
90 | 2.25.1 | 351 | 2.25.1 |
91 | 352 | ||
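The two mask constructors are easiest to see in isolation. A standalone model, with clrsb64 written out as a loop (QEMU's host-utils provides the real helper):

    #include <assert.h>
    #include <stdint.h>

    /* Leading redundant sign bits, excluding the sign bit itself. */
    static int clrsb64(uint64_t v)
    {
        uint64_t sign = v >> 63;
        int n = 0;
        for (int i = 62; i >= 0 && ((v >> i) & 1) == sign; i--) {
            n++;
        }
        return n;
    }

    /* s_mask for a known constant: a left-aligned mask of its sign reps. */
    static uint64_t smask_from_value(uint64_t value)
    {
        int rep = clrsb64(value);
        return ~(~0ull >> rep);
    }

    int main(void)
    {
        /* -1 repeats its sign down to bit 1: s_mask covers bits 63..1. */
        assert(smask_from_value(-1ull) == ~1ull);

        /* A value sign-extended from 32 bits has >= 32 bits of sign. */
        uint64_t v = (uint64_t)(int64_t)(int32_t)0x80000000u;
        assert((smask_from_value(v) >> 32) == 0xffffffffull);
        return 0;
    }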
1 | Change the return value to bool, because that's what it should | 1 | The result is either 0 or 1, which means that we have
---|---|---|---|
2 | have been from the start. Pass the ct mask instead of the whole | 2 | a 2-bit signed result, and thus 62 bits of sign.
3 | TCGArgConstraint, as that's the only part that's relevant. | ||
4 | 3 | ||
5 | Change the value argument to int64_t. We will need the extra | 4 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> |
6 | width for 32-bit hosts wanting to match vector constants. | 5 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> |
7 | 6 | Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | |
8 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | ||
9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
10 | --- | 8 | --- |
11 | tcg/tcg.c | 5 ++--- | 9 | tcg/optimize.c | 29 +++++++++++++++++++++++++++++ |
12 | tcg/aarch64/tcg-target.c.inc | 5 +---- | 10 | 1 file changed, 29 insertions(+) |
13 | tcg/arm/tcg-target.c.inc | 5 +---- | ||
14 | tcg/i386/tcg-target.c.inc | 4 +--- | ||
15 | tcg/mips/tcg-target.c.inc | 5 +---- | ||
16 | tcg/ppc/tcg-target.c.inc | 4 +--- | ||
17 | tcg/riscv/tcg-target.c.inc | 4 +--- | ||
18 | tcg/s390/tcg-target.c.inc | 5 +---- | ||
19 | tcg/sparc/tcg-target.c.inc | 5 +---- | ||
20 | tcg/tci/tcg-target.c.inc | 6 ++---- | ||
21 | 10 files changed, 12 insertions(+), 36 deletions(-) | ||
22 | 11 | ||
23 | diff --git a/tcg/tcg.c b/tcg/tcg.c | 12 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
24 | index XXXXXXX..XXXXXXX 100644 | 13 | index XXXXXXX..XXXXXXX 100644 |
25 | --- a/tcg/tcg.c | 14 | --- a/tcg/optimize.c |
26 | +++ b/tcg/tcg.c | 15 | +++ b/tcg/optimize.c |
27 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, | 16 | @@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op) |
28 | static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, | 17 | z2 = arg_info(op->args[2])->z_mask; |
29 | TCGReg base, intptr_t ofs); | 18 | ctx->z_mask = z1 & z2; |
30 | static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target); | 19 | |
31 | -static int tcg_target_const_match(tcg_target_long val, TCGType type, | 20 | + /* |
32 | - const TCGArgConstraint *arg_ct); | 21 | + * Sign repetitions are perforce all identical, whether they are 1 or 0. |
33 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct); | 22 | + * Bitwise operations preserve the relative quantity of the repetitions. |
34 | #ifdef TCG_TARGET_NEED_LDST_LABELS | 23 | + */ |
35 | static int tcg_out_ldst_finalize(TCGContext *s); | 24 | + ctx->s_mask = arg_info(op->args[1])->s_mask |
36 | #endif | 25 | + & arg_info(op->args[2])->s_mask; |
37 | @@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) | 26 | + |
38 | ts = arg_temp(arg); | 27 | /* |
39 | 28 | * Known-zeros does not imply known-ones. Therefore unless | |
40 | if (ts->val_type == TEMP_VAL_CONST | 29 | * arg2 is constant, we can't infer affected bits from it. |
41 | - && tcg_target_const_match(ts->val, ts->type, arg_ct)) { | 30 | @@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op) |
42 | + && tcg_target_const_match(ts->val, ts->type, arg_ct->ct)) { | ||
43 | /* constant is OK for instruction */ | ||
44 | const_args[i] = 1; | ||
45 | new_args[i] = ts->val; | ||
46 | diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc | ||
47 | index XXXXXXX..XXXXXXX 100644 | ||
48 | --- a/tcg/aarch64/tcg-target.c.inc | ||
49 | +++ b/tcg/aarch64/tcg-target.c.inc | ||
50 | @@ -XXX,XX +XXX,XX @@ static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8) | ||
51 | } | 31 | } |
32 | ctx->z_mask = z1; | ||
33 | |||
34 | + ctx->s_mask = arg_info(op->args[1])->s_mask | ||
35 | + & arg_info(op->args[2])->s_mask; | ||
36 | return fold_masks(ctx, op); | ||
52 | } | 37 | } |
53 | 38 | ||
54 | -static int tcg_target_const_match(tcg_target_long val, TCGType type, | 39 | @@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op) |
55 | - const TCGArgConstraint *arg_ct) | 40 | fold_xi_to_not(ctx, op, 0)) { |
56 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | 41 | return true; |
57 | { | ||
58 | - int ct = arg_ct->ct; | ||
59 | - | ||
60 | if (ct & TCG_CT_CONST) { | ||
61 | return 1; | ||
62 | } | 42 | } |
63 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | 43 | + |
64 | index XXXXXXX..XXXXXXX 100644 | 44 | + ctx->s_mask = arg_info(op->args[1])->s_mask |
65 | --- a/tcg/arm/tcg-target.c.inc | 45 | + & arg_info(op->args[2])->s_mask; |
66 | +++ b/tcg/arm/tcg-target.c.inc | 46 | return false; |
67 | @@ -XXX,XX +XXX,XX @@ static inline int check_fit_imm(uint32_t imm) | ||
68 | * mov operand2: values represented with x << (2 * y), x < 0x100 | ||
69 | * add, sub, eor...: ditto | ||
70 | */ | ||
71 | -static inline int tcg_target_const_match(tcg_target_long val, TCGType type, | ||
72 | - const TCGArgConstraint *arg_ct) | ||
73 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | ||
74 | { | ||
75 | - int ct; | ||
76 | - ct = arg_ct->ct; | ||
77 | if (ct & TCG_CT_CONST) { | ||
78 | return 1; | ||
79 | } else if ((ct & TCG_CT_CONST_ARM) && check_fit_imm(val)) { | ||
80 | diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc | ||
81 | index XXXXXXX..XXXXXXX 100644 | ||
82 | --- a/tcg/i386/tcg-target.c.inc | ||
83 | +++ b/tcg/i386/tcg-target.c.inc | ||
84 | @@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, | ||
85 | } | 47 | } |
86 | 48 | ||
87 | /* test if a constant matches the constraint */ | 49 | @@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op) |
88 | -static inline int tcg_target_const_match(tcg_target_long val, TCGType type, | 50 | |
89 | - const TCGArgConstraint *arg_ct) | 51 | ctx->z_mask = arg_info(op->args[3])->z_mask |
90 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | 52 | | arg_info(op->args[4])->z_mask; |
91 | { | 53 | + ctx->s_mask = arg_info(op->args[3])->s_mask |
92 | - int ct = arg_ct->ct; | 54 | + & arg_info(op->args[4])->s_mask; |
93 | if (ct & TCG_CT_CONST) { | 55 | |
94 | return 1; | 56 | if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) { |
57 | uint64_t tv = arg_info(op->args[3])->val; | ||
58 | @@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op) | ||
59 | fold_xi_to_not(ctx, op, -1)) { | ||
60 | return true; | ||
95 | } | 61 | } |
96 | diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc | 62 | + |
97 | index XXXXXXX..XXXXXXX 100644 | 63 | + ctx->s_mask = arg_info(op->args[1])->s_mask |
98 | --- a/tcg/mips/tcg-target.c.inc | 64 | + & arg_info(op->args[2])->s_mask; |
99 | +++ b/tcg/mips/tcg-target.c.inc | 65 | return false; |
100 | @@ -XXX,XX +XXX,XX @@ static inline bool is_p2m1(tcg_target_long val) | ||
101 | } | 66 | } |
102 | 67 | ||
103 | /* test if a constant matches the constraint */ | 68 | @@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op) |
104 | -static inline int tcg_target_const_match(tcg_target_long val, TCGType type, | 69 | fold_xi_to_not(ctx, op, 0)) { |
105 | - const TCGArgConstraint *arg_ct) | 70 | return true; |
106 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | 71 | } |
107 | { | 72 | + |
108 | - int ct; | 73 | + ctx->s_mask = arg_info(op->args[1])->s_mask |
109 | - ct = arg_ct->ct; | 74 | + & arg_info(op->args[2])->s_mask; |
110 | if (ct & TCG_CT_CONST) { | 75 | return false; |
111 | return 1; | ||
112 | } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) { | ||
113 | diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc | ||
114 | index XXXXXXX..XXXXXXX 100644 | ||
115 | --- a/tcg/ppc/tcg-target.c.inc | ||
116 | +++ b/tcg/ppc/tcg-target.c.inc | ||
117 | @@ -XXX,XX +XXX,XX @@ static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target) | ||
118 | } | 76 | } |
119 | 77 | ||
120 | /* test if a constant matches the constraint */ | 78 | @@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op) |
121 | -static int tcg_target_const_match(tcg_target_long val, TCGType type, | 79 | return true; |
122 | - const TCGArgConstraint *arg_ct) | ||
123 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | ||
124 | { | ||
125 | - int ct = arg_ct->ct; | ||
126 | if (ct & TCG_CT_CONST) { | ||
127 | return 1; | ||
128 | } | 80 | } |
129 | diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc | 81 | |
130 | index XXXXXXX..XXXXXXX 100644 | 82 | + ctx->s_mask = arg_info(op->args[1])->s_mask; |
131 | --- a/tcg/riscv/tcg-target.c.inc | 83 | + |
132 | +++ b/tcg/riscv/tcg-target.c.inc | 84 | /* Because of fold_to_not, we want to always return true, via finish. */ |
133 | @@ -XXX,XX +XXX,XX @@ static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len) | 85 | finish_folding(ctx, op); |
86 | return true; | ||
87 | @@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op) | ||
88 | |||
89 | ctx->z_mask = arg_info(op->args[1])->z_mask | ||
90 | | arg_info(op->args[2])->z_mask; | ||
91 | + ctx->s_mask = arg_info(op->args[1])->s_mask | ||
92 | + & arg_info(op->args[2])->s_mask; | ||
93 | return fold_masks(ctx, op); | ||
134 | } | 94 | } |
135 | 95 | ||
136 | /* test if a constant matches the constraint */ | 96 | @@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op) |
137 | -static int tcg_target_const_match(tcg_target_long val, TCGType type, | 97 | fold_ix_to_not(ctx, op, 0)) { |
138 | - const TCGArgConstraint *arg_ct) | 98 | return true; |
139 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | ||
140 | { | ||
141 | - int ct = arg_ct->ct; | ||
142 | if (ct & TCG_CT_CONST) { | ||
143 | return 1; | ||
144 | } | 99 | } |
145 | diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc | 100 | + |
146 | index XXXXXXX..XXXXXXX 100644 | 101 | + ctx->s_mask = arg_info(op->args[1])->s_mask |
147 | --- a/tcg/s390/tcg-target.c.inc | 102 | + & arg_info(op->args[2])->s_mask; |
148 | +++ b/tcg/s390/tcg-target.c.inc | 103 | return false; |
149 | @@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type, | ||
150 | } | 104 | } |
151 | 105 | ||
152 | /* Test if a constant matches the constraint. */ | 106 | @@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op) |
153 | -static int tcg_target_const_match(tcg_target_long val, TCGType type, | 107 | |
154 | - const TCGArgConstraint *arg_ct) | 108 | ctx->z_mask = arg_info(op->args[1])->z_mask |
155 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | 109 | | arg_info(op->args[2])->z_mask; |
156 | { | 110 | + ctx->s_mask = arg_info(op->args[1])->s_mask |
157 | - int ct = arg_ct->ct; | 111 | + & arg_info(op->args[2])->s_mask; |
158 | - | 112 | return fold_masks(ctx, op); |
159 | if (ct & TCG_CT_CONST) { | ||
160 | return 1; | ||
161 | } | ||
162 | diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc | ||
163 | index XXXXXXX..XXXXXXX 100644 | ||
164 | --- a/tcg/sparc/tcg-target.c.inc | ||
165 | +++ b/tcg/sparc/tcg-target.c.inc | ||
166 | @@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type, | ||
167 | } | 113 | } |
168 | 114 | ||
169 | /* test if a constant matches the constraint */ | ||
170 | -static inline int tcg_target_const_match(tcg_target_long val, TCGType type, | ||
171 | - const TCGArgConstraint *arg_ct) | ||
172 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | ||
173 | { | ||
174 | - int ct = arg_ct->ct; | ||
175 | - | ||
176 | if (ct & TCG_CT_CONST) { | ||
177 | return 1; | ||
178 | } | ||
179 | diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc | ||
180 | index XXXXXXX..XXXXXXX 100644 | ||
181 | --- a/tcg/tci/tcg-target.c.inc | ||
182 | +++ b/tcg/tci/tcg-target.c.inc | ||
183 | @@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, | ||
184 | } | ||
185 | |||
186 | /* Test if a constant matches the constraint. */ | ||
187 | -static int tcg_target_const_match(tcg_target_long val, TCGType type, | ||
188 | - const TCGArgConstraint *arg_ct) | ||
189 | +static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | ||
190 | { | ||
191 | - /* No need to return 0 or 1, 0 or != 0 is good enough. */ | ||
192 | - return arg_ct->ct & TCG_CT_CONST; | ||
193 | + return ct & TCG_CT_CONST; | ||
194 | } | ||
195 | |||
196 | static void tcg_target_init(TCGContext *s) | ||
197 | -- | 115 | -- |
198 | 2.25.1 | 116 | 2.25.1 |
199 | 117 | ||
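Why 62 bits of sign: with z_mask == 1 the value fits in 2 signed bits, so bits 63..2 are all redundant sign bits. Recomputing smask_from_zmask standalone (clz64 written out by hand; host-utils has the real one):

    #include <assert.h>
    #include <stdint.h>

    static int clz64(uint64_t v)
    {
        int n = 0;
        for (int i = 63; i >= 0 && !((v >> i) & 1); i--) {
            n++;
        }
        return n;
    }

    /* From earlier in the series: a zero msb makes the value an
     * unsigned field, so the signed field is one bit wider. */
    static uint64_t smask_from_zmask(uint64_t zmask)
    {
        int rep = clz64(zmask);
        if (rep == 0) {
            return 0;
        }
        rep -= 1;
        return ~(~0ull >> rep);
    }

    int main(void)
    {
        /* setcond: a 2-bit signed result, hence 62 bits of sign. */
        assert(smask_from_zmask(1) == ~3ull);
        return 0;
    }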
1 | These logical and arithmetic operations are optional, but are | 1 | The result is either 0 or 1, which means that we have |
---|---|---|---|
2 | trivial to accomplish with the existing infrastructure. | 2 | a 2 bit signed result, and thus 62 bits of sign. |
3 | For clarity, use the smask_from_zmask function. | ||
3 | 4 | ||
4 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 5 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> |
6 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
6 | --- | 8 | --- |
7 | tcg/arm/tcg-target-con-set.h | 1 + | 9 | tcg/optimize.c | 2 ++ |
8 | tcg/arm/tcg-target.h | 10 +++++----- | 10 | 1 file changed, 2 insertions(+) |
9 | tcg/arm/tcg-target.c.inc | 38 ++++++++++++++++++++++++++++++++++++ | ||
10 | 3 files changed, 44 insertions(+), 5 deletions(-) | ||
11 | 11 | ||
12 | diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h | 12 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
13 | index XXXXXXX..XXXXXXX 100644 | 13 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/tcg/arm/tcg-target-con-set.h | 14 | --- a/tcg/optimize.c |
15 | +++ b/tcg/arm/tcg-target-con-set.h | 15 | +++ b/tcg/optimize.c |
16 | @@ -XXX,XX +XXX,XX @@ C_O0_I4(s, s, s, s) | 16 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op) |
17 | C_O1_I1(r, l) | 17 | } |
18 | C_O1_I1(r, r) | 18 | |
19 | C_O1_I1(w, r) | 19 | ctx->z_mask = 1; |
20 | +C_O1_I1(w, w) | 20 | + ctx->s_mask = smask_from_zmask(1); |
21 | C_O1_I1(w, wr) | 21 | return false; |
22 | C_O1_I2(r, 0, rZ) | 22 | } |
23 | C_O1_I2(r, l, l) | 23 | |
24 | diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h | 24 | @@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op) |
25 | index XXXXXXX..XXXXXXX 100644 | 25 | } |
26 | --- a/tcg/arm/tcg-target.h | 26 | |
27 | +++ b/tcg/arm/tcg-target.h | 27 | ctx->z_mask = 1; |
28 | @@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions; | 28 | + ctx->s_mask = smask_from_zmask(1); |
29 | #define TCG_TARGET_HAS_v128 use_neon_instructions | 29 | return false; |
30 | #define TCG_TARGET_HAS_v256 0 | 30 | |
31 | 31 | do_setcond_const: | |
32 | -#define TCG_TARGET_HAS_andc_vec 0 | ||
33 | -#define TCG_TARGET_HAS_orc_vec 0 | ||
34 | -#define TCG_TARGET_HAS_not_vec 0 | ||
35 | -#define TCG_TARGET_HAS_neg_vec 0 | ||
36 | -#define TCG_TARGET_HAS_abs_vec 0 | ||
37 | +#define TCG_TARGET_HAS_andc_vec 1 | ||
38 | +#define TCG_TARGET_HAS_orc_vec 1 | ||
39 | +#define TCG_TARGET_HAS_not_vec 1 | ||
40 | +#define TCG_TARGET_HAS_neg_vec 1 | ||
41 | +#define TCG_TARGET_HAS_abs_vec 1 | ||
42 | #define TCG_TARGET_HAS_roti_vec 0 | ||
43 | #define TCG_TARGET_HAS_rots_vec 0 | ||
44 | #define TCG_TARGET_HAS_rotv_vec 0 | ||
45 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | ||
46 | index XXXXXXX..XXXXXXX 100644 | ||
47 | --- a/tcg/arm/tcg-target.c.inc | ||
48 | +++ b/tcg/arm/tcg-target.c.inc | ||
49 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
50 | |||
51 | INSN_VADD = 0xf2000800, | ||
52 | INSN_VAND = 0xf2000110, | ||
53 | + INSN_VBIC = 0xf2100110, | ||
54 | INSN_VEOR = 0xf3000110, | ||
55 | + INSN_VORN = 0xf2300110, | ||
56 | INSN_VORR = 0xf2200110, | ||
57 | INSN_VSUB = 0xf3000800, | ||
58 | |||
59 | + INSN_VABS = 0xf3b10300, | ||
60 | INSN_VMVN = 0xf3b00580, | ||
61 | + INSN_VNEG = 0xf3b10380, | ||
62 | |||
63 | INSN_VCEQ0 = 0xf3b10100, | ||
64 | INSN_VCGT0 = 0xf3b10000, | ||
65 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | ||
66 | return C_O1_I1(w, r); | ||
67 | case INDEX_op_dup_vec: | ||
68 | return C_O1_I1(w, wr); | ||
69 | + case INDEX_op_abs_vec: | ||
70 | + case INDEX_op_neg_vec: | ||
71 | + case INDEX_op_not_vec: | ||
72 | + return C_O1_I1(w, w); | ||
73 | case INDEX_op_dup2_vec: | ||
74 | case INDEX_op_add_vec: | ||
75 | case INDEX_op_sub_vec: | ||
76 | case INDEX_op_xor_vec: | ||
77 | return C_O1_I2(w, w, w); | ||
78 | case INDEX_op_or_vec: | ||
79 | + case INDEX_op_andc_vec: | ||
80 | return C_O1_I2(w, w, wO); | ||
81 | case INDEX_op_and_vec: | ||
82 | + case INDEX_op_orc_vec: | ||
83 | return C_O1_I2(w, w, wV); | ||
84 | case INDEX_op_cmp_vec: | ||
85 | return C_O1_I2(w, w, wZ); | ||
86 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
87 | case INDEX_op_dup2_vec: | ||
88 | tcg_out_dup2_vec(s, a0, a1, a2); | ||
89 | return; | ||
90 | + case INDEX_op_abs_vec: | ||
91 | + tcg_out_vreg2(s, INSN_VABS, q, vece, a0, a1); | ||
92 | + return; | ||
93 | + case INDEX_op_neg_vec: | ||
94 | + tcg_out_vreg2(s, INSN_VNEG, q, vece, a0, a1); | ||
95 | + return; | ||
96 | + case INDEX_op_not_vec: | ||
97 | + tcg_out_vreg2(s, INSN_VMVN, q, 0, a0, a1); | ||
98 | + return; | ||
99 | case INDEX_op_add_vec: | ||
100 | tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2); | ||
101 | return; | ||
102 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
103 | tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); | ||
104 | return; | ||
105 | |||
106 | + case INDEX_op_andc_vec: | ||
107 | + if (!const_args[2]) { | ||
108 | + tcg_out_vreg3(s, INSN_VBIC, q, 0, a0, a1, a2); | ||
109 | + return; | ||
110 | + } | ||
111 | + a2 = ~a2; | ||
112 | + /* fall through */ | ||
113 | case INDEX_op_and_vec: | ||
114 | if (const_args[2]) { | ||
115 | is_shimm1632(~a2, &cmode, &imm8); | ||
116 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
117 | tcg_out_vreg3(s, INSN_VAND, q, 0, a0, a1, a2); | ||
118 | return; | ||
119 | |||
120 | + case INDEX_op_orc_vec: | ||
121 | + if (!const_args[2]) { | ||
122 | + tcg_out_vreg3(s, INSN_VORN, q, 0, a0, a1, a2); | ||
123 | + return; | ||
124 | + } | ||
125 | + a2 = ~a2; | ||
126 | + /* fall through */ | ||
127 | case INDEX_op_or_vec: | ||
128 | if (const_args[2]) { | ||
129 | is_shimm1632(a2, &cmode, &imm8); | ||
130 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | ||
131 | case INDEX_op_add_vec: | ||
132 | case INDEX_op_sub_vec: | ||
133 | case INDEX_op_and_vec: | ||
134 | + case INDEX_op_andc_vec: | ||
135 | case INDEX_op_or_vec: | ||
136 | + case INDEX_op_orc_vec: | ||
137 | case INDEX_op_xor_vec: | ||
138 | + case INDEX_op_not_vec: | ||
139 | return 1; | ||
140 | + case INDEX_op_abs_vec: | ||
141 | case INDEX_op_cmp_vec: | ||
142 | + case INDEX_op_neg_vec: | ||
143 | return vece < MO_64; | ||
144 | default: | ||
145 | return 0; | ||
146 | -- | 32 | -- |
147 | 2.25.1 | 33 | 2.25.1 |
148 | 34 | ||
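A concrete instance of the claim: if both inputs sign-extend from 8 bits, the result of any bitwise op still does, so intersecting the two s_masks is sound. An illustrative check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Both sign-extended from 8 bits: 56 redundant sign bits each. */
        int64_t a = -0x70;  /* 0xffffffffffffff90 */
        int64_t b = 0x35;   /* 0x0000000000000035 */

        /* Each result still round-trips through an 8-bit sign extend. */
        assert((int64_t)(int8_t)(a & b) == (a & b));
        assert((int64_t)(int8_t)(a | b) == (a | b));
        assert((int64_t)(int8_t)(a ^ b) == (a ^ b));
        return 0;
    }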
1 | NEON has 3 instructions implementing this 4-argument operation, | 1 | The results are generally 6-bit unsigned values, though
---|---|---|---|
2 | with each insn overlapping a different logical input onto the | 2 | the count-leading and count-trailing-zeros ops may produce any value
3 | destination register. | 3 | for a zero input. |
4 | 4 | ||
5 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 5 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> |
6 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 7 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
7 | --- | 8 | --- |
8 | tcg/arm/tcg-target-con-set.h | 1 + | 9 | tcg/optimize.c | 3 ++- |
9 | tcg/arm/tcg-target.h | 2 +- | 10 | 1 file changed, 2 insertions(+), 1 deletion(-) |
10 | tcg/arm/tcg-target.c.inc | 22 ++++++++++++++++++++-- | ||
11 | 3 files changed, 22 insertions(+), 3 deletions(-) | ||
12 | 11 | ||
13 | diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h | 12 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
14 | index XXXXXXX..XXXXXXX 100644 | 13 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/tcg/arm/tcg-target-con-set.h | 14 | --- a/tcg/optimize.c |
16 | +++ b/tcg/arm/tcg-target-con-set.h | 15 | +++ b/tcg/optimize.c |
17 | @@ -XXX,XX +XXX,XX @@ C_O1_I2(w, w, w) | 16 | @@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op) |
18 | C_O1_I2(w, w, wO) | 17 | g_assert_not_reached(); |
19 | C_O1_I2(w, w, wV) | 18 | } |
20 | C_O1_I2(w, w, wZ) | 19 | ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask; |
21 | +C_O1_I3(w, w, w, w) | ||
22 | C_O1_I4(r, r, r, rI, rI) | ||
23 | C_O1_I4(r, r, rIN, rIK, 0) | ||
24 | C_O2_I1(r, r, l) | ||
25 | diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h | ||
26 | index XXXXXXX..XXXXXXX 100644 | ||
27 | --- a/tcg/arm/tcg-target.h | ||
28 | +++ b/tcg/arm/tcg-target.h | ||
29 | @@ -XXX,XX +XXX,XX @@ extern bool use_neon_instructions; | ||
30 | #define TCG_TARGET_HAS_mul_vec 1 | ||
31 | #define TCG_TARGET_HAS_sat_vec 1 | ||
32 | #define TCG_TARGET_HAS_minmax_vec 1 | ||
33 | -#define TCG_TARGET_HAS_bitsel_vec 0 | ||
34 | +#define TCG_TARGET_HAS_bitsel_vec 1 | ||
35 | #define TCG_TARGET_HAS_cmpsel_vec 0 | ||
36 | |||
37 | #define TCG_TARGET_DEFAULT_MO (0) | ||
38 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | ||
39 | index XXXXXXX..XXXXXXX 100644 | ||
40 | --- a/tcg/arm/tcg-target.c.inc | ||
41 | +++ b/tcg/arm/tcg-target.c.inc | ||
42 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
43 | INSN_VSARI = 0xf2800010, /* VSHR.S */ | ||
44 | INSN_VSHRI = 0xf3800010, /* VSHR.U */ | ||
45 | |||
46 | + INSN_VBSL = 0xf3100110, | ||
47 | + INSN_VBIT = 0xf3200110, | ||
48 | + INSN_VBIF = 0xf3300110, | ||
49 | + | ||
50 | INSN_VTST = 0xf2000810, | ||
51 | |||
52 | INSN_VDUP_G = 0xee800b10, /* VDUP (ARM core register) */ | ||
53 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | ||
54 | return C_O1_I2(w, w, wV); | ||
55 | case INDEX_op_cmp_vec: | ||
56 | return C_O1_I2(w, w, wZ); | ||
57 | - | 20 | - |
58 | + case INDEX_op_bitsel_vec: | 21 | + ctx->s_mask = smask_from_zmask(ctx->z_mask); |
59 | + return C_O1_I3(w, w, w, w); | 22 | return false; |
23 | } | ||
24 | |||
25 | @@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op) | ||
60 | default: | 26 | default: |
61 | g_assert_not_reached(); | 27 | g_assert_not_reached(); |
62 | } | 28 | } |
63 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | 29 | + ctx->s_mask = smask_from_zmask(ctx->z_mask); |
64 | { | 30 | return false; |
65 | TCGType type = vecl + TCG_TYPE_V64; | 31 | } |
66 | unsigned q = vecl; | 32 | |
67 | - TCGArg a0, a1, a2; | ||
68 | + TCGArg a0, a1, a2, a3; | ||
69 | int cmode, imm8; | ||
70 | |||
71 | a0 = args[0]; | ||
72 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
73 | } | ||
74 | return; | ||
75 | |||
76 | + case INDEX_op_bitsel_vec: | ||
77 | + a3 = args[3]; | ||
78 | + if (a0 == a3) { | ||
79 | + tcg_out_vreg3(s, INSN_VBIT, q, 0, a0, a2, a1); | ||
80 | + } else if (a0 == a2) { | ||
81 | + tcg_out_vreg3(s, INSN_VBIF, q, 0, a0, a3, a1); | ||
82 | + } else { | ||
83 | + tcg_out_mov(s, type, a0, a1); | ||
84 | + tcg_out_vreg3(s, INSN_VBSL, q, 0, a0, a2, a3); | ||
85 | + } | ||
86 | + return; | ||
87 | + | ||
88 | case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ | ||
89 | case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ | ||
90 | default: | ||
91 | @@ -XXX,XX +XXX,XX @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | ||
92 | case INDEX_op_sssub_vec: | ||
93 | case INDEX_op_usadd_vec: | ||
94 | case INDEX_op_ussub_vec: | ||
95 | + case INDEX_op_bitsel_vec: | ||
96 | return 1; | ||
97 | case INDEX_op_abs_vec: | ||
98 | case INDEX_op_cmp_vec: | ||
99 | -- | 33 | -- |
100 | 2.25.1 | 34 | 2.25.1 |
101 | 35 | ||
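The underlying operation is, per bit, d = (sel & t) | (~sel & f). The three NEON encodings compute exactly this but differ in which logical input shares the destination register: VBSL (d holds sel), VBIT (d holds f), VBIF (d holds t), which is what the three-way dispatch above selects on. A scalar sketch of the semantics (not the encodings):

    #include <assert.h>
    #include <stdint.h>

    /* Bitwise select: take t where sel is 1, f where sel is 0. */
    static uint64_t bitsel(uint64_t sel, uint64_t t, uint64_t f)
    {
        return (sel & t) | (~sel & f);
    }

    int main(void)
    {
        assert(bitsel(0xf0f0, 0xaaaa, 0x5555) == 0xa5a5);
        assert(bitsel(0, 0xdead, 0xbeef) == 0xbeef);     /* all from f */
        assert(bitsel(~0ull, 0xdead, 0xbeef) == 0xdead); /* all from t */
        return 0;
    }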
1 | Implementing dup2, add, sub, and, or, xor as the minimal set. | 1 | For constant shifts, we can simply shift the s_mask. |
---|---|---|---|
2 | This allows us to actually enable neon in the header file. | ||
3 | 2 | ||
4 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | 3 | For variable shifts, we know that sar does not reduce |
4 | the s_mask, which helps for sequences like | ||
5 | |||
6 | ext32s_i64 t, in | ||
7 | sar_i64 t, t, v | ||
8 | ext32s_i64 out, t | ||
9 | |||
10 | allowing the final extend to be eliminated. | ||
11 | |||
12 | Reviewed-by: Alex Bennée <alex.bennee@linaro.org> | ||
13 | Reviewed-by: Luis Pires <luis.pires@eldorado.org.br> | ||
5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 14 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
6 | --- | 15 | --- |
7 | tcg/arm/tcg-target-con-set.h | 3 + | 16 | tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- |
8 | tcg/arm/tcg-target-con-str.h | 2 + | 17 | 1 file changed, 47 insertions(+), 3 deletions(-) |
9 | tcg/arm/tcg-target.h | 6 +- | ||
10 | tcg/arm/tcg-target.c.inc | 201 +++++++++++++++++++++++++++++++++-- | ||
11 | 4 files changed, 204 insertions(+), 8 deletions(-) | ||
12 | 18 | ||
13 | diff --git a/tcg/arm/tcg-target-con-set.h b/tcg/arm/tcg-target-con-set.h | 19 | diff --git a/tcg/optimize.c b/tcg/optimize.c |
14 | index XXXXXXX..XXXXXXX 100644 | 20 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/tcg/arm/tcg-target-con-set.h | 21 | --- a/tcg/optimize.c |
16 | +++ b/tcg/arm/tcg-target-con-set.h | 22 | +++ b/tcg/optimize.c |
17 | @@ -XXX,XX +XXX,XX @@ C_O1_I2(r, r, rIN) | 23 | @@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask) |
18 | C_O1_I2(r, r, ri) | 24 | return ~(~0ull >> rep); |
19 | C_O1_I2(r, rZ, rZ) | ||
20 | C_O1_I2(w, w, w) | ||
21 | +C_O1_I2(w, w, wO) | ||
22 | +C_O1_I2(w, w, wV) | ||
23 | +C_O1_I2(w, w, wZ) | ||
24 | C_O1_I4(r, r, r, rI, rI) | ||
25 | C_O1_I4(r, r, rIN, rIK, 0) | ||
26 | C_O2_I1(r, r, l) | ||
27 | diff --git a/tcg/arm/tcg-target-con-str.h b/tcg/arm/tcg-target-con-str.h | ||
28 | index XXXXXXX..XXXXXXX 100644 | ||
29 | --- a/tcg/arm/tcg-target-con-str.h | ||
30 | +++ b/tcg/arm/tcg-target-con-str.h | ||
31 | @@ -XXX,XX +XXX,XX @@ REGS('w', ALL_VECTOR_REGS) | ||
32 | CONST('I', TCG_CT_CONST_ARM) | ||
33 | CONST('K', TCG_CT_CONST_INV) | ||
34 | CONST('N', TCG_CT_CONST_NEG) | ||
35 | +CONST('O', TCG_CT_CONST_ORRI) | ||
36 | +CONST('V', TCG_CT_CONST_ANDI) | ||
37 | CONST('Z', TCG_CT_CONST_ZERO) | ||
38 | diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h | ||
39 | index XXXXXXX..XXXXXXX 100644 | ||
40 | --- a/tcg/arm/tcg-target.h | ||
41 | +++ b/tcg/arm/tcg-target.h | ||
42 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
43 | #else | ||
44 | extern bool use_idiv_instructions; | ||
45 | #endif | ||
46 | -#define use_neon_instructions 0 | ||
47 | +#ifdef __ARM_NEON__ | ||
48 | +#define use_neon_instructions 1 | ||
49 | +#else | ||
50 | +extern bool use_neon_instructions; | ||
51 | +#endif | ||
52 | |||
53 | /* used for function call generation */ | ||
54 | #define TCG_TARGET_STACK_ALIGN 8 | ||
55 | diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc | ||
56 | index XXXXXXX..XXXXXXX 100644 | ||
57 | --- a/tcg/arm/tcg-target.c.inc | ||
58 | +++ b/tcg/arm/tcg-target.c.inc | ||
59 | @@ -XXX,XX +XXX,XX @@ int arm_arch = __ARM_ARCH; | ||
60 | #ifndef use_idiv_instructions | ||
61 | bool use_idiv_instructions; | ||
62 | #endif | ||
63 | +#ifndef use_neon_instructions | ||
64 | +bool use_neon_instructions; | ||
65 | +#endif | ||
66 | |||
67 | /* ??? Ought to think about changing CONFIG_SOFTMMU to always defined. */ | ||
68 | #ifdef CONFIG_SOFTMMU | ||
69 | @@ -XXX,XX +XXX,XX @@ typedef enum { | ||
70 | /* Otherwise the assembler uses mov r0,r0 */ | ||
71 | INSN_NOP_v4 = (COND_AL << 28) | ARITH_MOV, | ||
72 | |||
73 | + INSN_VADD = 0xf2000800, | ||
74 | + INSN_VAND = 0xf2000110, | ||
75 | + INSN_VEOR = 0xf3000110, | ||
76 | INSN_VORR = 0xf2200110, | ||
77 | + INSN_VSUB = 0xf3000800, | ||
78 | + | ||
79 | + INSN_VMVN = 0xf3b00580, | ||
80 | + | ||
81 | + INSN_VCEQ0 = 0xf3b10100, | ||
82 | + INSN_VCGT0 = 0xf3b10000, | ||
83 | + INSN_VCGE0 = 0xf3b10080, | ||
84 | + INSN_VCLE0 = 0xf3b10180, | ||
85 | + INSN_VCLT0 = 0xf3b10200, | ||
86 | + | ||
87 | + INSN_VCEQ = 0xf3000810, | ||
88 | + INSN_VCGE = 0xf2000310, | ||
89 | + INSN_VCGT = 0xf2000300, | ||
90 | + INSN_VCGE_U = 0xf3000310, | ||
91 | + INSN_VCGT_U = 0xf3000300, | ||
92 | + | ||
93 | + INSN_VTST = 0xf2000810, | ||
94 | |||
95 | INSN_VDUP_G = 0xee800b10, /* VDUP (ARM core register) */ | ||
96 | INSN_VDUP_S = 0xf3b00c00, /* VDUP (scalar) */ | ||
97 | @@ -XXX,XX +XXX,XX @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, | ||
98 | #define TCG_CT_CONST_INV 0x200 | ||
99 | #define TCG_CT_CONST_NEG 0x400 | ||
100 | #define TCG_CT_CONST_ZERO 0x800 | ||
101 | +#define TCG_CT_CONST_ORRI 0x1000 | ||
102 | +#define TCG_CT_CONST_ANDI 0x2000 | ||
103 | |||
104 | #define ALL_GENERAL_REGS 0xffffu | ||
105 | #define ALL_VECTOR_REGS 0xffff0000u | ||
106 | @@ -XXX,XX +XXX,XX @@ static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8) | ||
107 | return i; | ||
108 | } | 25 | } |
109 | 26 | ||
110 | +/* Return true if V is a valid 16-bit or 32-bit shifted immediate. */ | 27 | +/* |
111 | +static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8) | 28 | + * Recreate a properly left-aligned smask after manipulation. |
29 | + * Some bit-shuffling, particularly shifts and rotates, may | ||
30 | + * retain sign bits on the left, but may scatter disconnected | ||
31 | + * sign bits on the right. Retain only what remains to the left. | ||
32 | + */ | ||
33 | +static uint64_t smask_from_smask(int64_t smask) | ||
112 | +{ | 34 | +{ |
113 | + if (v32 == deposit32(v32, 16, 16, v32)) { | 35 | + /* Only the 1 bits are significant for smask */ |
114 | + return is_shimm16(v32, cmode, imm8); | 36 | + return smask_from_zmask(~smask); |
115 | + } else { | ||
116 | + return is_shimm32(v32, cmode, imm8); | ||
117 | + } | ||
118 | +} | 37 | +} |
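Note on the new tcg/optimize helper above: smask_from_smask() re-canonicalizes a sign mask by round-tripping through smask_from_zmask(). The leading 1s of smask become leading 0s of ~smask, so only the contiguous left-aligned run of sign bits survives and any scattered bits on the right are dropped. A standalone sketch, reimplementing both helpers around a plain clz so the values can be checked (the example mask is made up):

    #include <stdint.h>
    #include <stdio.h>

    static int clz64(uint64_t v)            /* QEMU's clz64(0) == 64 */
    {
        return v ? __builtin_clzll(v) : 64; /* GCC/Clang builtin */
    }

    static uint64_t smask_from_zmask(uint64_t zmask)
    {
        int rep = clz64(zmask);
        if (rep == 0) {
            return 0;               /* msb may be set: no sign info */
        }
        rep -= 1;
        return ~(~0ull >> rep);     /* top rep bits set */
    }

    static uint64_t smask_from_smask(int64_t smask)
    {
        return smask_from_zmask(~smask);  /* leading 1s -> leading 0s */
    }

    int main(void)
    {
        /* 16 sign-bit copies on the left, one stray bit on the right:
         * the stray bit is dropped and the left run is kept (one bit
         * short, by construction of the zmask round trip). */
        uint64_t s = 0xffff000000000001ull;
        printf("%016llx\n",               /* prints fffe000000000000 */
               (unsigned long long)smask_from_smask((int64_t)s));
        return 0;
    }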
119 | + | 38 | + |
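On the tcg/arm side of the same hunk, is_shimm1632() dispatches on a deposit32() identity: overwriting a value's high half with its own low half leaves it unchanged exactly when the two halves already match, i.e. when the 32-bit constant is a 16-bit pattern replicated twice. Minimal sketch, with deposit32 re-done locally to QEMU's semantics:

    #include <stdbool.h>
    #include <stdint.h>

    /* Same semantics as QEMU's deposit32(): insert the low LEN bits
     * of FIELD into VALUE starting at bit POS. */
    static uint32_t deposit32(uint32_t value, int pos, int len,
                              uint32_t field)
    {
        uint32_t mask = ((len < 32 ? (1u << len) : 0u) - 1) << pos;
        return (value & ~mask) | ((field << pos) & mask);
    }

    static bool halves_match(uint32_t v32)
    {
        /* e.g. 0x00420042 -> true (16-bit case); 0x00420043 -> false */
        return v32 == deposit32(v32, 16, 16, v32);
    }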
120 | /* Test if a constant matches the constraint. | 39 | static inline TempOptInfo *ts_info(TCGTemp *ts) |
121 | * TODO: define constraints for: | 40 | { |
122 | * | 41 | return ts->state_ptr; |
123 | @@ -XXX,XX +XXX,XX @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) | 42 | @@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op) |
124 | return 1; | 43 | |
125 | } else if ((ct & TCG_CT_CONST_ZERO) && val == 0) { | 44 | static bool fold_shift(OptContext *ctx, TCGOp *op) |
126 | return 1; | 45 | { |
127 | - } else { | 46 | + uint64_t s_mask, z_mask, sign; |
128 | - return 0; | 47 | + |
48 | if (fold_const2(ctx, op) || | ||
49 | fold_ix_to_i(ctx, op, 0) || | ||
50 | fold_xi_to_x(ctx, op, 0)) { | ||
51 | return true; | ||
52 | } | ||
53 | |||
54 | + s_mask = arg_info(op->args[1])->s_mask; | ||
55 | + z_mask = arg_info(op->args[1])->z_mask; | ||
56 | + | ||
57 | if (arg_is_const(op->args[2])) { | ||
58 | - ctx->z_mask = do_constant_folding(op->opc, ctx->type, | ||
59 | - arg_info(op->args[1])->z_mask, | ||
60 | - arg_info(op->args[2])->val); | ||
61 | + int sh = arg_info(op->args[2])->val; | ||
62 | + | ||
63 | + ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh); | ||
64 | + | ||
65 | + s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh); | ||
66 | + ctx->s_mask = smask_from_smask(s_mask); | ||
67 | + | ||
68 | return fold_masks(ctx, op); | ||
129 | } | 69 | } |
130 | + | 70 | + |
131 | + switch (ct & (TCG_CT_CONST_ORRI | TCG_CT_CONST_ANDI)) { | 71 | + switch (op->opc) { |
132 | + case 0: | 72 | + CASE_OP_32_64(sar): |
73 | + /* | ||
74 | + * Arithmetic right shift will not reduce the number of | ||
75 | + * input sign repetitions. | ||
76 | + */ | ||
77 | + ctx->s_mask = s_mask; | ||
133 | + break; | 78 | + break; |
134 | + case TCG_CT_CONST_ANDI: | 79 | + CASE_OP_32_64(shr): |
135 | + val = ~val; | 80 | + /* |
136 | + /* fallthru */ | 81 | + * If the sign bit is known zero, then logical right shift |
137 | + case TCG_CT_CONST_ORRI: | 82 | + * will not reduce the number of input sign repetitions. |
138 | + if (val == deposit64(val, 32, 32, val)) { | 83 | + */ |
139 | + int cmode, imm8; | 84 | + sign = (s_mask & -s_mask) >> 1; |
140 | + return is_shimm1632(val, &cmode, &imm8); | 85 | + if (!(z_mask & sign)) { |
86 | + ctx->s_mask = s_mask; | ||
141 | + } | 87 | + } |
142 | + break; | 88 | + break; |
143 | + default: | 89 | + default: |
144 | + /* The two bits should not both be set for the same insn. */ | 90 | + break; |
145 | + g_assert_not_reached(); | ||
146 | + } | 91 | + } |
147 | + | 92 | + |
148 | + return 0; | 93 | return false; |
149 | } | 94 | } |
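In the shr case of fold_shift() above, the expression (s_mask & -s_mask) >> 1 locates the sign bit: under the series' convention that s_mask is a left-aligned run of bits known to repeat the sign, s_mask & -s_mask isolates the lowest bit of that run, and the bit one position below it is the sign bit itself. Testing that bit against z_mask then answers whether the sign bit can be nonzero. A worked sketch with made-up masks:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* assume bits 63..48 are known sign-bit repetitions ... */
        uint64_t s_mask = 0xffff000000000000ull;
        /* ... and only bits 46..0 may be nonzero */
        uint64_t z_mask = 0x00007fffffffffffull;

        uint64_t lowest = s_mask & -s_mask;  /* bit 48: bottom of run */
        uint64_t sign   = lowest >> 1;       /* bit 47: the sign bit */

        if (!(z_mask & sign)) {
            /* sign bit provably zero: shr behaves like sar here, so
             * the sign-repetition info survives the logical shift */
            printf("shr keeps s_mask %016llx\n",
                   (unsigned long long)s_mask);
        }
        return 0;
    }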
150 | 95 | ||
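Back on the left, the new tail of tcg_target_const_match() encodes two facts. First, NEON has no and-with-immediate, only bic, so an AND constant is tested after inversion (vand x,#imm will be emitted as vbic x,#~imm). Second, the deposit64() test demands that the 64-bit constant be one 32-bit pattern replicated across both halves before is_shimm1632() is consulted. The shape of that test as a sketch; encodable_shimm1632() is a hypothetical stand-in for the real is_shimm1632(v32, &cmode, &imm8) call:

    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t deposit64(uint64_t value, int pos, int len,
                              uint64_t field)
    {
        uint64_t mask = ((len < 64 ? (1ull << len) : 0ull) - 1) << pos;
        return (value & ~mask) | ((field << pos) & mask);
    }

    /* hypothetical placeholder: accept every replicated pattern */
    static bool encodable_shimm1632(uint32_t v32)
    {
        (void)v32;
        return true;
    }

    static bool matches_logic_imm(uint64_t val, bool is_and)
    {
        if (is_and) {
            val = ~val;          /* AND #imm is emitted as BIC #~imm */
        }
        /* both 32-bit halves must hold the same pattern */
        if (val == deposit64(val, 32, 32, val)) {
            return encodable_shimm1632((uint32_t)val);
        }
        return false;
    }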
151 | static inline void tcg_out_b(TCGContext *s, int cond, int32_t offset) | ||
152 | @@ -XXX,XX +XXX,XX @@ static uint32_t encode_vm(TCGReg rm) | ||
153 | return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1); | ||
154 | } | ||
155 | |||
156 | +static void tcg_out_vreg2(TCGContext *s, ARMInsn insn, int q, int vece, | ||
157 | + TCGReg d, TCGReg m) | ||
158 | +{ | ||
159 | + tcg_out32(s, insn | (vece << 18) | (q << 6) | | ||
160 | + encode_vd(d) | encode_vm(m)); | ||
161 | +} | ||
162 | + | ||
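A decoding aid for the encode_vm() used above: the shift left by one reflects that each vector register here names a NEON Q register, which the instruction set encodes by the number of its even-numbered D half (d = 2*q), split across the M bit (instruction bit 5) and the Vm field (instruction bits 3:0, whose low bit is always clear since 2*q is even). That reading of the register numbering is my inference from the helper; the round trip below checks it:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t extract32(uint32_t v, int pos, int len)
    {
        return (v >> pos) & ((1u << len) - 1);
    }

    /* as in the patch: M at insn bit 5, Vm at insn bits 3:0 */
    static uint32_t encode_vm(uint32_t rm)
    {
        return (extract32(rm, 3, 1) << 5) | (extract32(rm, 0, 3) << 1);
    }

    int main(void)
    {
        for (uint32_t q = 0; q < 16; q++) {
            uint32_t enc = encode_vm(q);
            /* reassemble the D-register number from M:Vm */
            uint32_t d = (((enc >> 5) & 1) << 4) | (enc & 0xf);
            printf("q%u -> d%u\n", q, d);  /* always d == 2 * q */
        }
        return 0;
    }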
163 | static void tcg_out_vreg3(TCGContext *s, ARMInsn insn, int q, int vece, | ||
164 | TCGReg d, TCGReg n, TCGReg m) | ||
165 | { | ||
166 | @@ -XXX,XX +XXX,XX @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) | ||
167 | case INDEX_op_add_vec: | ||
168 | case INDEX_op_sub_vec: | ||
169 | case INDEX_op_xor_vec: | ||
170 | - case INDEX_op_or_vec: | ||
171 | - case INDEX_op_and_vec: | ||
172 | - case INDEX_op_cmp_vec: | ||
173 | return C_O1_I2(w, w, w); | ||
174 | + case INDEX_op_or_vec: | ||
175 | + return C_O1_I2(w, w, wO); | ||
176 | + case INDEX_op_and_vec: | ||
177 | + return C_O1_I2(w, w, wV); | ||
178 | + case INDEX_op_cmp_vec: | ||
179 | + return C_O1_I2(w, w, wZ); | ||
180 | |||
181 | default: | ||
182 | g_assert_not_reached(); | ||
183 | @@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, | ||
184 | } | ||
185 | } | ||
186 | |||
187 | +static const ARMInsn vec_cmp_insn[16] = { | ||
188 | + [TCG_COND_EQ] = INSN_VCEQ, | ||
189 | + [TCG_COND_GT] = INSN_VCGT, | ||
190 | + [TCG_COND_GE] = INSN_VCGE, | ||
191 | + [TCG_COND_GTU] = INSN_VCGT_U, | ||
192 | + [TCG_COND_GEU] = INSN_VCGE_U, | ||
193 | +}; | ||
194 | + | ||
195 | +static const ARMInsn vec_cmp0_insn[16] = { | ||
196 | + [TCG_COND_EQ] = INSN_VCEQ0, | ||
197 | + [TCG_COND_GT] = INSN_VCGT0, | ||
198 | + [TCG_COND_GE] = INSN_VCGE0, | ||
199 | + [TCG_COND_LT] = INSN_VCLT0, | ||
200 | + [TCG_COND_LE] = INSN_VCLE0, | ||
201 | +}; | ||
202 | + | ||
203 | static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, | ||
204 | unsigned vecl, unsigned vece, | ||
205 | const TCGArg *args, const int *const_args) | ||
206 | { | ||
207 | - g_assert_not_reached(); | ||
208 | + TCGType type = vecl + TCG_TYPE_V64; | ||
209 | + unsigned q = vecl; | ||
210 | + TCGArg a0, a1, a2; | ||
211 | + int cmode, imm8; | ||
212 | + | ||
213 | + a0 = args[0]; | ||
214 | + a1 = args[1]; | ||
215 | + a2 = args[2]; | ||
216 | + | ||
217 | + switch (opc) { | ||
218 | + case INDEX_op_ld_vec: | ||
219 | + tcg_out_ld(s, type, a0, a1, a2); | ||
220 | + return; | ||
221 | + case INDEX_op_st_vec: | ||
222 | + tcg_out_st(s, type, a0, a1, a2); | ||
223 | + return; | ||
224 | + case INDEX_op_dupm_vec: | ||
225 | + tcg_out_dupm_vec(s, type, vece, a0, a1, a2); | ||
226 | + return; | ||
227 | + case INDEX_op_dup2_vec: | ||
228 | + tcg_out_dup2_vec(s, a0, a1, a2); | ||
229 | + return; | ||
230 | + case INDEX_op_add_vec: | ||
231 | + tcg_out_vreg3(s, INSN_VADD, q, vece, a0, a1, a2); | ||
232 | + return; | ||
233 | + case INDEX_op_sub_vec: | ||
234 | + tcg_out_vreg3(s, INSN_VSUB, q, vece, a0, a1, a2); | ||
235 | + return; | ||
236 | + case INDEX_op_xor_vec: | ||
237 | + tcg_out_vreg3(s, INSN_VEOR, q, 0, a0, a1, a2); | ||
238 | + return; | ||
239 | + | ||
240 | + case INDEX_op_and_vec: | ||
241 | + if (const_args[2]) { | ||
242 | + is_shimm1632(~a2, &cmode, &imm8); | ||
243 | + if (a0 == a1) { | ||
244 | + tcg_out_vmovi(s, a0, q, 1, cmode | 1, imm8); /* VBICI */ | ||
245 | + return; | ||
246 | + } | ||
247 | + tcg_out_vmovi(s, a0, q, 1, cmode, imm8); /* VMVNI */ | ||
248 | + a2 = a0; | ||
249 | + } | ||
250 | + tcg_out_vreg3(s, INSN_VAND, q, 0, a0, a1, a2); | ||
251 | + return; | ||
252 | + | ||
253 | + case INDEX_op_or_vec: | ||
254 | + if (const_args[2]) { | ||
255 | + is_shimm1632(a2, &cmode, &imm8); | ||
256 | + if (a0 == a1) { | ||
257 | + tcg_out_vmovi(s, a0, q, 0, cmode | 1, imm8); /* VORRI */ | ||
258 | + return; | ||
259 | + } | ||
260 | + tcg_out_vmovi(s, a0, q, 0, cmode, imm8); /* VMOVI */ | ||
261 | + a2 = a0; | ||
262 | + } | ||
263 | + tcg_out_vreg3(s, INSN_VORR, q, 0, a0, a1, a2); | ||
264 | + return; | ||
265 | + | ||
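The cmode | 1 in the two immediate paths above is doing real work: for the shifted-immediate cmodes produced by is_shimm16/is_shimm32, setting bit 0 of cmode turns the VMOV (immediate) encoding into VORR (immediate), and likewise VMVN into VBIC. Hence the split: when the destination aliases the source, one in-place VORRI/VBICI suffices; otherwise the constant is first materialized with VMOVI/VMVNI into the destination and a register-register VORR/VAND finishes. A sketch of the decision, with a hypothetical emitter stub in place of tcg_out_vmovi():

    #include <stdio.h>

    /* hypothetical stub standing in for the real emitters */
    static void emit_imm(const char *mnem, int cmode, int imm8)
    {
        printf("%s cmode=%#x imm8=%#x\n", mnem, cmode, imm8);
    }

    static void lower_or_imm(int a0, int a1, int cmode, int imm8)
    {
        if (a0 == a1) {
            /* single in-place VORR (immediate) */
            emit_imm("VORRI", cmode | 1, imm8);
        } else {
            emit_imm("VMOVI", cmode, imm8);   /* materialize into a0 */
            printf("VORR q%d, q%d, q%d\n", a0, a1, a0);
        }
    }

    int main(void)
    {
        lower_or_imm(0, 0, 0x0, 0x42);  /* aliased: one instruction */
        lower_or_imm(0, 1, 0x0, 0x42);  /* move constant, then or */
        return 0;
    }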
266 | + case INDEX_op_cmp_vec: | ||
267 | + { | ||
268 | + TCGCond cond = args[3]; | ||
269 | + | ||
270 | + if (cond == TCG_COND_NE) { | ||
271 | + if (const_args[2]) { | ||
272 | + tcg_out_vreg3(s, INSN_VTST, q, vece, a0, a1, a1); | ||
273 | + } else { | ||
274 | + tcg_out_vreg3(s, INSN_VCEQ, q, vece, a0, a1, a2); | ||
275 | + tcg_out_vreg2(s, INSN_VMVN, q, 0, a0, a0); | ||
276 | + } | ||
277 | + } else { | ||
278 | + ARMInsn insn; | ||
279 | + | ||
280 | + if (const_args[2]) { | ||
281 | + insn = vec_cmp0_insn[cond]; | ||
282 | + if (insn) { | ||
283 | + tcg_out_vreg2(s, insn, q, vece, a0, a1); | ||
284 | + return; | ||
285 | + } | ||
286 | + tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0); | ||
287 | + a2 = TCG_VEC_TMP; | ||
288 | + } | ||
289 | + insn = vec_cmp_insn[cond]; | ||
290 | + if (insn == 0) { | ||
291 | + TCGArg t; | ||
292 | + t = a1, a1 = a2, a2 = t; | ||
293 | + cond = tcg_swap_cond(cond); | ||
294 | + insn = vec_cmp_insn[cond]; | ||
295 | + tcg_debug_assert(insn != 0); | ||
296 | + } | ||
297 | + tcg_out_vreg3(s, insn, q, vece, a0, a1, a2); | ||
298 | + } | ||
299 | + } | ||
300 | + return; | ||
301 | + | ||
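The cmp_vec case above papers over two ISA gaps. NE has no direct NEON encoding: against the constant zero (the wZ constraint) it becomes VTST a1,a1, i.e. "any bit set", and otherwise VCEQ followed by VMVN. And since vec_cmp_insn[] only carries EQ/GT/GE/GTU/GEU, the LT/LE orderings are reached by swapping the operands and reversing the condition via tcg_swap_cond(). A standalone sketch of that fallback (mirroring the swap's effect, not QEMU's actual enum values):

    #include <stdio.h>

    typedef enum { EQ, NE, LT, GE, LE, GT, LTU, GEU, LEU, GTU } Cond;

    /* swapping the operands of a compare reverses the ordering */
    static Cond swap_cond(Cond c)
    {
        switch (c) {
        case LT:  return GT;
        case LE:  return GE;
        case LTU: return GTU;
        case LEU: return GEU;
        case GT:  return LT;
        case GE:  return LE;
        case GTU: return LTU;
        case GEU: return LEU;
        default:  return c;   /* EQ/NE are symmetric */
        }
    }

    /* what vec_cmp_insn[] above provides directly */
    static int has_insn(Cond c)
    {
        return c == EQ || c == GT || c == GE || c == GTU || c == GEU;
    }

    int main(void)
    {
        Cond c = LT;
        if (!has_insn(c)) {
            c = swap_cond(c);            /* LT(a,b) == GT(b,a) */
        }
        printf("emit VCGT with swapped operands: %d\n", c == GT);
        return 0;
    }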
302 | + case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ | ||
303 | + case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ | ||
304 | + default: | ||
305 | + g_assert_not_reached(); | ||
306 | + } | ||
307 | } | ||
308 | |||
309 | int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) | ||
310 | { | ||
311 | - return 0; | ||
312 | + switch (opc) { | ||
313 | + case INDEX_op_add_vec: | ||
314 | + case INDEX_op_sub_vec: | ||
315 | + case INDEX_op_and_vec: | ||
316 | + case INDEX_op_or_vec: | ||
317 | + case INDEX_op_xor_vec: | ||
318 | + return 1; | ||
319 | + case INDEX_op_cmp_vec: | ||
320 | + return vece < MO_64; | ||
321 | + default: | ||
322 | + return 0; | ||
323 | + } | ||
324 | } | ||
325 | |||
326 | void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, | ||
327 | -- | 96 | -- |
328 | 2.25.1 | 97 | 2.25.1 |
329 | 98 | ||