Series comparison

-[PATCH 00/43] tcg patch queue
+[PULL 00/56] tcg patch queue
-This patch collection contains:
+The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:
-  * A couple of fixes for i386 host vector support.
+  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)
-  * Some random cleanups cherry-picked from some inactive branches.
+are available in the Git repository at:
-  * A reposting (with fix) of my "better handling of constants" set:
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027
-    https://lists.nongnu.org/archive/html/qemu-devel/2020-05/msg02152.html
+for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:
-  * A couple of patches that centralize the set of host constraints.
+  tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)
     This, I believe, is slightly cleaner than the current state of
     affairs, even before the ultimate goal of pre-validating the
     contents as well.
+----------------------------------------------------------------
+Improvements to qemu/int128
+Fixes for 128/64 division.
+Cleanup tcg/optimize.c
+Optimize redundant sign extensions
-r~
+----------------------------------------------------------------
 Frédéric Pétrot (1):
       qemu/int128: Add int128_{not,xor}
+Luis Pires (4):
+      host-utils: move checks out of divu128/divs128
+      host-utils: move udiv_qrnnd() to host-utils
+      host-utils: add 128-bit quotient support to divu128/divs128
+      host-utils: add unit tests for divu128/divs128
-Richard Henderson (43):
+Richard Henderson (51):
-  tcg: Adjust simd_desc size encoding
+      tcg/optimize: Rename "mask" to "z_mask"
-  tcg: Drop union from TCGArgConstraint
+      tcg/optimize: Split out OptContext
-  tcg: Move sorted_args into TCGArgConstraint.sort_index
+      tcg/optimize: Remove do_default label
-  tcg: Remove TCG_CT_REG
+      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
-  tcg: Move some TCG_CT_* bits to TCGArgConstraint bitfields
+      tcg/optimize: Move prev_mb into OptContext
-  tcg: Remove TCGOpDef.used
+      tcg/optimize: Split out init_arguments
-  tcg/i386: Fix dupi for avx2 32-bit hosts
+      tcg/optimize: Split out copy_propagate
-  tcg: Fix generation of dupi_vec for 32-bit host
+      tcg/optimize: Split out fold_call
-  tcg/optimize: Fold dup2_vec
+      tcg/optimize: Drop nb_oargs, nb_iargs locals
-  tcg: Remove TCG_TARGET_HAS_cmp_vec
+      tcg/optimize: Change fail return for do_constant_folding_cond*
-  tcg: Use tcg_out_dupi_vec from temp_load
+      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
-  tcg: Increase tcg_out_dupi_vec immediate to int64_t
+      tcg/optimize: Split out finish_folding
-  tcg: Consolidate 3 bits into enum TCGTempKind
+      tcg/optimize: Use a boolean to avoid a mass of continues
-  tcg: Add temp_readonly
+      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
-  tcg: Expand TCGTemp.val to 64-bits
+      tcg/optimize: Split out fold_const{1,2}
-  tcg: Rename struct tcg_temp_info to TempOptInfo
+      tcg/optimize: Split out fold_setcond2
-  tcg: Expand TempOptInfo to 64-bits
+      tcg/optimize: Split out fold_brcond2
-  tcg: Introduce TYPE_CONST temporaries
+      tcg/optimize: Split out fold_brcond
-  tcg/optimize: Improve find_better_copy
+      tcg/optimize: Split out fold_setcond
-  tcg/optimize: Adjust TempOptInfo allocation
+      tcg/optimize: Split out fold_mulu2_i32
-  tcg/optimize: Use tcg_constant_internal with constant folding
+      tcg/optimize: Split out fold_addsub2_i32
-  tcg: Convert tcg_gen_dupi_vec to TCG_CONST
+      tcg/optimize: Split out fold_movcond
-  tcg: Use tcg_constant_i32 with icount expander
+      tcg/optimize: Split out fold_extract2
-  tcg: Use tcg_constant_{i32,i64} with tcg int expanders
+      tcg/optimize: Split out fold_extract, fold_sextract
-  tcg: Use tcg_constant_{i32,i64} with tcg plugins
+      tcg/optimize: Split out fold_deposit
-  tcg: Use tcg_constant_{i32,i64,vec} with gvec expanders
+      tcg/optimize: Split out fold_count_zeros
-  tcg/tci: Add special tci_movi_{i32,i64} opcodes
+      tcg/optimize: Split out fold_bswap
-  tcg: Remove movi and dupi opcodes
+      tcg/optimize: Split out fold_dup, fold_dup2
-  tcg: Add tcg_reg_alloc_dup2
+      tcg/optimize: Split out fold_mov
-  tcg/i386: Use tcg_constant_vec with tcg vec expanders
+      tcg/optimize: Split out fold_xx_to_i
-  tcg: Remove tcg_gen_dup{8,16,32,64}i_vec
+      tcg/optimize: Split out fold_xx_to_x
-  tcg/ppc: Use tcg_constant_vec with tcg vec expanders
+      tcg/optimize: Split out fold_xi_to_i
-  tcg/aarch64: Use tcg_constant_vec with tcg vec expanders
+      tcg/optimize: Add type to OptContext
-  tcg: Add tcg-constr.c.inc
+      tcg/optimize: Split out fold_to_not
-  tcg/i386: Convert to tcg-constr.c.inc
+      tcg/optimize: Split out fold_sub_to_neg
-  tcg/aarch64: Convert to tcg-constr.c.inc
+      tcg/optimize: Split out fold_xi_to_x
-  tcg/arm: Convert to tcg-constr.c.inc
+      tcg/optimize: Split out fold_ix_to_i
-  tcg/mips: Convert to tcg-constr.c.inc
+      tcg/optimize: Split out fold_masks
-  tcg/ppc: Convert to tcg-constr.c.inc
+      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
-  tcg/riscv: Convert to tcg-constr.c.inc
+      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
-  tcg/s390: Convert to tcg-constr.c.inc
+      tcg/optimize: Sink commutative operand swapping into fold functions
-  tcg/sparc: Convert to tcg-constr.c.inc
+      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
-  tcg/tci: Convert to tcg-constr.c.inc
+      tcg/optimize: Use fold_xx_to_i for orc
       tcg/optimize: Use fold_xi_to_x for mul
       tcg/optimize: Use fold_xi_to_x for div
       tcg/optimize: Use fold_xx_to_i for rem
       tcg/optimize: Optimize sign extensions
       tcg/optimize: Propagate sign info for logical operations
       tcg/optimize: Propagate sign info for setcond
       tcg/optimize: Propagate sign info for bit counting
       tcg/optimize: Propagate sign info for shifting
- include/exec/gen-icount.h       |  25 +-
+ include/fpu/softfloat-macros.h |   82 --
- include/tcg/tcg-gvec-desc.h     |  38 ++-
+ include/hw/clock.h             |    5 +-
- include/tcg/tcg-op.h            |  17 +-
+ include/qemu/host-utils.h      |  121 +-
- include/tcg/tcg-opc.h           |  11 +-
+ include/qemu/int128.h          |   20 +
- include/tcg/tcg.h               |  72 +++--
+ target/ppc/int_helper.c        |   23 +-
- tcg/aarch64/tcg-target-constr.h |  31 ++
+ tcg/optimize.c                 | 2644 ++++++++++++++++++++++++----------------
- tcg/aarch64/tcg-target.h        |   1 -
+ tests/unit/test-div128.c       |  197 +++
- tcg/arm/tcg-target-constr.h     |  30 ++
+ util/host-utils.c              |  147 ++-
- tcg/i386/tcg-target-constr.h    |  55 ++++
+ tests/unit/meson.build         |    1 +
- tcg/i386/tcg-target.h           |   1 -
+files changed, 2053 insertions(+), 1187 deletions(-)
- tcg/mips/tcg-target-constr.h    |  31 ++
+ create mode 100644 tests/unit/test-div128.c
  tcg/ppc/tcg-target-constr.h     |  37 +++
  tcg/ppc/tcg-target.h            |   1 -
  tcg/riscv/tcg-target-constr.h   |  25 ++
  tcg/s390/tcg-target-constr.h    |  24 ++
  tcg/sparc/tcg-target-constr.h   |  27 ++
  tcg/tci/tcg-target-constr.h     |  28 ++
  accel/tcg/plugin-gen.c          |  49 ++-
  tcg/optimize.c                  | 254 ++++++++-------
  tcg/tcg-op-gvec.c               | 160 +++++-----
  tcg/tcg-op-vec.c                |  48 +--
  tcg/tcg-op.c                    | 227 +++++++------
  tcg/tcg.c                       | 549 +++++++++++++++++++++++---------
  tcg/tci.c                       |   4 +-
  tcg/aarch64/tcg-target.c.inc    | 134 +++-----
  tcg/arm/tcg-target.c.inc        | 123 +++----
  tcg/i386/tcg-target.c.inc       | 336 +++++++++----------
  tcg/mips/tcg-target.c.inc       | 118 +++----
  tcg/ppc/tcg-target.c.inc        | 254 +++++++--------
  tcg/riscv/tcg-target.c.inc      | 100 ++----
  tcg/s390/tcg-target.c.inc       | 143 ++++-----
  tcg/sparc/tcg-target.c.inc      |  97 ++----
  tcg/tcg-constr.c.inc            | 108 +++++++
  tcg/tci/tcg-target.c.inc        | 369 ++++++++-------------
 files changed, 1893 insertions(+), 1634 deletions(-)
  create mode 100644 tcg/aarch64/tcg-target-constr.h
  create mode 100644 tcg/arm/tcg-target-constr.h
  create mode 100644 tcg/i386/tcg-target-constr.h
  create mode 100644 tcg/mips/tcg-target-constr.h
  create mode 100644 tcg/ppc/tcg-target-constr.h
  create mode 100644 tcg/riscv/tcg-target-constr.h
  create mode 100644 tcg/s390/tcg-target-constr.h
  create mode 100644 tcg/sparc/tcg-target-constr.h
  create mode 100644 tcg/tci/tcg-target-constr.h
  create mode 100644 tcg/tcg-constr.c.inc
---
-.25.1

-[PATCH 24/43] tcg: Use tcg_constant_{i32,i64} with tcg int expanders
+[PULL 01/56] qemu/int128: Add int128_{not,xor}
+From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
+Addition of not and xor on 128-bit integers.
+Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
+Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
+Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
+[rth: Split out logical operations.]
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-op.h |  13 +--
+ include/qemu/int128.h | 20 ++++++++++++++++++++
- tcg/tcg-op.c         | 227 ++++++++++++++++++++-----------------------
+file changed, 20 insertions(+)
 files changed, 109 insertions(+), 131 deletions(-)
-diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
+diff --git a/include/qemu/int128.h b/include/qemu/int128.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op.h
+--- a/include/qemu/int128.h
-+++ b/include/tcg/tcg-op.h
++++ b/include/qemu/int128.h
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_mb(TCGBar);
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
+     return a;
  /* 32 bit ops */
 +void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg);
  void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
  void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
  void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
      }
  }
--static inline void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg)
++static inline Int128 int128_not(Int128 a)
 -{
 -    tcg_gen_op2i_i32(INDEX_op_movi_i32, ret, arg);
 -}
 -
  static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2,
                                      tcg_target_long offset)
  {
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
  /* 64 bit ops */
 +void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
  void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
  void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
  void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
      }
  }
 -static inline void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
 -{
 -    tcg_gen_op2i_i64(INDEX_op_movi_i64, ret, arg);
 -}
 -
  static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2,
                                      tcg_target_long offset)
  {
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
  void tcg_gen_discard_i64(TCGv_i64 arg);
  void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
 -void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
  void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
  void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
  void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
 diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op.c
 +++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mb(TCGBar mb_type)
  /* 32 bit ops */
 +void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg)
 +{
-+    tcg_gen_mov_i32(ret, tcg_constant_i32(arg));
++    return ~a;
 +}
 +
- void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
+ static inline Int128 int128_and(Int128 a, Int128 b)
  {
-     /* some cases can be optimized here */
+     return a & b;
-     if (arg2 == 0) {
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
-         tcg_gen_mov_i32(ret, arg1);
+     return a | b;
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_add_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_add_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2)
++static inline Int128 int128_xor(Int128 a, Int128 b)
          /* Don't recurse with tcg_gen_neg_i32.  */
          tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg2);
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg1);
 -        tcg_gen_sub_i32(ret, t0, arg2);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_sub_i32(ret, tcg_constant_i32(arg1), arg2);
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
      if (arg2 == 0) {
          tcg_gen_mov_i32(ret, arg1);
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_sub_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_sub_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
  void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
  {
 -    TCGv_i32 t0;
      /* Some cases can be optimized here.  */
      switch (arg2) {
      case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
          }
          break;
      }
 -    t0 = tcg_const_i32(arg2);
 -    tcg_gen_and_i32(ret, arg1, t0);
 -    tcg_temp_free_i32(t0);
 +
 +    tcg_gen_and_i32(ret, arg1, tcg_constant_i32(arg2));
  }
  void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
      } else if (arg2 == 0) {
          tcg_gen_mov_i32(ret, arg1);
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_or_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_or_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
          /* Don't recurse with tcg_gen_not_i32.  */
          tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg1);
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_xor_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_xor_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
      if (arg2 == 0) {
          tcg_gen_mov_i32(ret, arg1);
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_shl_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_shl_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
      if (arg2 == 0) {
          tcg_gen_mov_i32(ret, arg1);
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_shr_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_shr_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
      if (arg2 == 0) {
          tcg_gen_mov_i32(ret, arg1);
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_sar_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_sar_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, TCGLabel *l)
      if (cond == TCG_COND_ALWAYS) {
          tcg_gen_br(l);
      } else if (cond != TCG_COND_NEVER) {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_brcond_i32(cond, arg1, t0, l);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_brcond_i32(cond, arg1, tcg_constant_i32(arg2), l);
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
  void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
                            TCGv_i32 arg1, int32_t arg2)
  {
 -    TCGv_i32 t0 = tcg_const_i32(arg2);
 -    tcg_gen_setcond_i32(cond, ret, arg1, t0);
 -    tcg_temp_free_i32(t0);
 +    tcg_gen_setcond_i32(cond, ret, arg1, tcg_constant_i32(arg2));
  }
  void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
      } else if (is_power_of_2(arg2)) {
          tcg_gen_shli_i32(ret, arg1, ctz32(arg2));
      } else {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_mul_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_mul_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_clz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
  void tcg_gen_clzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
  {
 -    TCGv_i32 t = tcg_const_i32(arg2);
 -    tcg_gen_clz_i32(ret, arg1, t);
 -    tcg_temp_free_i32(t);
 +    tcg_gen_clz_i32(ret, arg1, tcg_constant_i32(arg2));
  }
  void tcg_gen_ctz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
              tcg_gen_clzi_i32(t, t, 32);
              tcg_gen_xori_i32(t, t, 31);
          }
 -        z = tcg_const_i32(0);
 +        z = tcg_constant_i32(0);
          tcg_gen_movcond_i32(TCG_COND_EQ, ret, arg1, z, arg2, t);
          tcg_temp_free_i32(t);
 -        tcg_temp_free_i32(z);
      } else {
          gen_helper_ctz_i32(ret, arg1, arg2);
      }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
          tcg_gen_ctpop_i32(ret, t);
          tcg_temp_free_i32(t);
      } else {
 -        TCGv_i32 t = tcg_const_i32(arg2);
 -        tcg_gen_ctz_i32(ret, arg1, t);
 -        tcg_temp_free_i32(t);
 +        tcg_gen_ctz_i32(ret, arg1, tcg_constant_i32(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
      if (arg2 == 0) {
          tcg_gen_mov_i32(ret, arg1);
      } else if (TCG_TARGET_HAS_rot_i32) {
 -        TCGv_i32 t0 = tcg_const_i32(arg2);
 -        tcg_gen_rotl_i32(ret, arg1, t0);
 -        tcg_temp_free_i32(t0);
 +        tcg_gen_rotl_i32(ret, arg1, tcg_constant_i32(arg2));
      } else {
          TCGv_i32 t0, t1;
          t0 = tcg_temp_new_i32();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
          tcg_gen_andi_i32(ret, arg, (1u << len) - 1);
      } else if (TCG_TARGET_HAS_deposit_i32
                 && TCG_TARGET_deposit_i32_valid(ofs, len)) {
 -        TCGv_i32 zero = tcg_const_i32(0);
 +        TCGv_i32 zero = tcg_constant_i32(0);
          tcg_gen_op5ii_i32(INDEX_op_deposit_i32, ret, zero, arg, ofs, len);
 -        tcg_temp_free_i32(zero);
      } else {
          /* To help two-operand hosts we prefer to zero-extend first,
             which allows ARG to stay live.  */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
      } else {
          TCGv_i32 t0 = tcg_temp_new_i32();
          TCGv_i32 t1 = tcg_temp_new_i32();
 -        TCGv_i32 t2 = tcg_const_i32(0x00ff00ff);
 +        TCGv_i32 t2 = tcg_constant_i32(0x00ff00ff);
                                          /* arg = abcd */
          tcg_gen_shri_i32(t0, arg, 8);   /*  t0 = .abc */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
          tcg_temp_free_i32(t0);
          tcg_temp_free_i32(t1);
 -        tcg_temp_free_i32(t2);
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_discard_i64(TCGv_i64 arg)
  void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
  {
 -    tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
 -    tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
 +    TCGTemp *ts = tcgv_i64_temp(arg);
 +
 +    /* Canonicalize TCGv_i64 TEMP_CONST into TCGv_i32 TEMP_CONST. */
 +    if (ts->kind == TEMP_CONST) {
 +        tcg_gen_movi_i64(ret, ts->val);
 +    } else {
 +        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
 +        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
 +    }
  }
  void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
      tcg_temp_free_i64(t0);
      tcg_temp_free_i32(t1);
  }
 +
 +#else
 +
 +void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
 +{
-+    tcg_gen_mov_i64(ret, tcg_constant_i64(arg));
++    return a ^ b;
 +}
 +
- #endif /* TCG_TARGET_REG_SIZE == 32 */
+ static inline Int128 int128_rshift(Int128 a, int n)
+ {
- void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+     return a >> n;
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
-     /* some cases can be optimized here */
+     return int128_make128(a, (a < 0) ? -1 : 0);
      if (arg2 == 0) {
          tcg_gen_mov_i64(ret, arg1);
 +    } else if (TCG_TARGET_REG_BITS == 64) {
 +        tcg_gen_add_i64(ret, arg1, tcg_constant_i64(arg2));
      } else {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_add_i64(ret, arg1, t0);
 -        tcg_temp_free_i64(t0);
 +        tcg_gen_add2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
 +                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
 +                         tcg_constant_i32(arg2), tcg_constant_i32(arg2 >> 32));
      }
  }
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2)
++static inline Int128 int128_not(Int128 a)
-     if (arg1 == 0 && TCG_TARGET_HAS_neg_i64) {
++{
-         /* Don't recurse with tcg_gen_neg_i64.  */
++    return int128_make128(~a.lo, ~a.hi);
-         tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg2);
++}
-+    } else if (TCG_TARGET_REG_BITS == 64) {
++
-+        tcg_gen_sub_i64(ret, tcg_constant_i64(arg1), arg2);
+ static inline Int128 int128_and(Int128 a, Int128 b)
-     } else {
+ {
--        TCGv_i64 t0 = tcg_const_i64(arg1);
+     return int128_make128(a.lo & b.lo, a.hi & b.hi);
--        tcg_gen_sub_i64(ret, t0, arg2);
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
--        tcg_temp_free_i64(t0);
+     return int128_make128(a.lo | b.lo, a.hi | b.hi);
 +        tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
 +                         tcg_constant_i32(arg1), tcg_constant_i32(arg1 >> 32),
 +                         TCGV_LOW(arg2), TCGV_HIGH(arg2));
      }
  }
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
++static inline Int128 int128_xor(Int128 a, Int128 b)
-     /* some cases can be optimized here */
++{
-     if (arg2 == 0) {
++    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
-         tcg_gen_mov_i64(ret, arg1);
++}
-+    } else if (TCG_TARGET_REG_BITS == 64) {
++
-+        tcg_gen_sub_i64(ret, arg1, tcg_constant_i64(arg2));
+ static inline Int128 int128_rshift(Int128 a, int n)
      } else {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_sub_i64(ret, arg1, t0);
 -        tcg_temp_free_i64(t0);
 +        tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
 +                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
 +                         tcg_constant_i32(arg2), tcg_constant_i32(arg2 >> 32));
      }
  }
  void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
  {
--    TCGv_i64 t0;
+     int64_t h;
 -
      if (TCG_TARGET_REG_BITS == 32) {
          tcg_gen_andi_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
          tcg_gen_andi_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
          }
          break;
      }
 -    t0 = tcg_const_i64(arg2);
 -    tcg_gen_and_i64(ret, arg1, t0);
 -    tcg_temp_free_i64(t0);
 +
 +    tcg_gen_and_i64(ret, arg1, tcg_constant_i64(arg2));
  }
  void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
      } else if (arg2 == 0) {
          tcg_gen_mov_i64(ret, arg1);
      } else {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_or_i64(ret, arg1, t0);
 -        tcg_temp_free_i64(t0);
 +        tcg_gen_or_i64(ret, arg1, tcg_constant_i64(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
          /* Don't recurse with tcg_gen_not_i64.  */
          tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg1);
      } else {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_xor_i64(ret, arg1, t0);
 -        tcg_temp_free_i64(t0);
 +        tcg_gen_xor_i64(ret, arg1, tcg_constant_i64(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
      } else if (arg2 == 0) {
          tcg_gen_mov_i64(ret, arg1);
      } else {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_shl_i64(ret, arg1, t0);
 -        tcg_temp_free_i64(t0);
 +        tcg_gen_shl_i64(ret, arg1, tcg_constant_i64(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
      } else if (arg2 == 0) {
          tcg_gen_mov_i64(ret, arg1);
      } else {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_shr_i64(ret, arg1, t0);
 -        tcg_temp_free_i64(t0);
 +        tcg_gen_shr_i64(ret, arg1, tcg_constant_i64(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
      } else if (arg2 == 0) {
          tcg_gen_mov_i64(ret, arg1);
      } else {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_sar_i64(ret, arg1, t0);
 -        tcg_temp_free_i64(t0);
 +        tcg_gen_sar_i64(ret, arg1, tcg_constant_i64(arg2));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, TCGLabel *l)
  void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, TCGLabel *l)
  {
 -    if (cond == TCG_COND_ALWAYS) {
 +    if (TCG_TARGET_REG_BITS == 64) {
 +        tcg_gen_brcond_i64(cond, arg1, tcg_constant_i64(arg2), l);
 +    } else if (cond == TCG_COND_ALWAYS) {
          tcg_gen_br(l);
      } else if (cond != TCG_COND_NEVER) {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_brcond_i64(cond, arg1, t0, l);
 -        tcg_temp_free_i64(t0);
 +        l->refs++;
 +        tcg_gen_op6ii_i32(INDEX_op_brcond2_i32,
 +                          TCGV_LOW(arg1), TCGV_HIGH(arg1),
 +                          tcg_constant_i32(arg2),
 +                          tcg_constant_i32(arg2 >> 32),
 +                          cond, label_arg(l));
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
  void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
                            TCGv_i64 arg1, int64_t arg2)
  {
 -    TCGv_i64 t0 = tcg_const_i64(arg2);
 -    tcg_gen_setcond_i64(cond, ret, arg1, t0);
 -    tcg_temp_free_i64(t0);
 +    if (TCG_TARGET_REG_BITS == 64) {
 +        tcg_gen_setcond_i64(cond, ret, arg1, tcg_constant_i64(arg2));
 +    } else if (cond == TCG_COND_ALWAYS) {
 +        tcg_gen_movi_i64(ret, 1);
 +    } else if (cond == TCG_COND_NEVER) {
 +        tcg_gen_movi_i64(ret, 0);
 +    } else {
 +        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, TCGV_LOW(ret),
 +                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
 +                         tcg_constant_i32(arg2),
 +                         tcg_constant_i32(arg2 >> 32), cond);
 +        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
 +    }
  }
  void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
      } else {
          TCGv_i64 t0 = tcg_temp_new_i64();
          TCGv_i64 t1 = tcg_temp_new_i64();
 -        TCGv_i64 t2 = tcg_const_i64(0x00ff00ff);
 +        TCGv_i64 t2 = tcg_constant_i64(0x00ff00ff);
                                          /* arg = ....abcd */
          tcg_gen_shri_i64(t0, arg, 8);   /*  t0 = .....abc */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
          tcg_temp_free_i64(t0);
          tcg_temp_free_i64(t1);
 -        tcg_temp_free_i64(t2);
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_clzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
      if (TCG_TARGET_REG_BITS == 32
          && TCG_TARGET_HAS_clz_i32
          && arg2 <= 0xffffffffu) {
 -        TCGv_i32 t = tcg_const_i32((uint32_t)arg2 - 32);
 -        tcg_gen_clz_i32(t, TCGV_LOW(arg1), t);
 +        TCGv_i32 t = tcg_temp_new_i32();
 +        tcg_gen_clzi_i32(t, TCGV_LOW(arg1), arg2 - 32);
          tcg_gen_addi_i32(t, t, 32);
          tcg_gen_clz_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), t);
          tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
          tcg_temp_free_i32(t);
      } else {
 -        TCGv_i64 t = tcg_const_i64(arg2);
 -        tcg_gen_clz_i64(ret, arg1, t);
 -        tcg_temp_free_i64(t);
 +        TCGv_i64 t0 = tcg_const_i64(arg2);
 +        tcg_gen_clz_i64(ret, arg1, t0);
 +        tcg_temp_free_i64(t0);
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
              tcg_gen_clzi_i64(t, t, 64);
              tcg_gen_xori_i64(t, t, 63);
          }
 -        z = tcg_const_i64(0);
 +        z = tcg_constant_i64(0);
          tcg_gen_movcond_i64(TCG_COND_EQ, ret, arg1, z, arg2, t);
          tcg_temp_free_i64(t);
          tcg_temp_free_i64(z);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
      if (TCG_TARGET_REG_BITS == 32
          && TCG_TARGET_HAS_ctz_i32
          && arg2 <= 0xffffffffu) {
 -        TCGv_i32 t32 = tcg_const_i32((uint32_t)arg2 - 32);
 -        tcg_gen_ctz_i32(t32, TCGV_HIGH(arg1), t32);
 +        TCGv_i32 t32 = tcg_temp_new_i32();
 +        tcg_gen_ctzi_i32(t32, TCGV_HIGH(arg1), arg2 - 32);
          tcg_gen_addi_i32(t32, t32, 32);
          tcg_gen_ctz_i32(TCGV_LOW(ret), TCGV_LOW(arg1), t32);
          tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
          tcg_gen_ctpop_i64(ret, t);
          tcg_temp_free_i64(t);
      } else {
 -        TCGv_i64 t64 = tcg_const_i64(arg2);
 -        tcg_gen_ctz_i64(ret, arg1, t64);
 -        tcg_temp_free_i64(t64);
 +        TCGv_i64 t0 = tcg_const_i64(arg2);
 +        tcg_gen_ctz_i64(ret, arg1, t0);
 +        tcg_temp_free_i64(t0);
      }
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
      if (arg2 == 0) {
          tcg_gen_mov_i64(ret, arg1);
      } else if (TCG_TARGET_HAS_rot_i64) {
 -        TCGv_i64 t0 = tcg_const_i64(arg2);
 -        tcg_gen_rotl_i64(ret, arg1, t0);
 -        tcg_temp_free_i64(t0);
 +        tcg_gen_rotl_i64(ret, arg1, tcg_constant_i64(arg2));
      } else {
          TCGv_i64 t0, t1;
          t0 = tcg_temp_new_i64();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
          tcg_gen_andi_i64(ret, arg, (1ull << len) - 1);
      } else if (TCG_TARGET_HAS_deposit_i64
                 && TCG_TARGET_deposit_i64_valid(ofs, len)) {
 -        TCGv_i64 zero = tcg_const_i64(0);
 +        TCGv_i64 zero = tcg_constant_i64(0);
          tcg_gen_op5ii_i64(INDEX_op_deposit_i64, ret, zero, arg, ofs, len);
 -        tcg_temp_free_i64(zero);
      } else {
          if (TCG_TARGET_REG_BITS == 32) {
              if (ofs >= 32) {
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv,
  #ifdef CONFIG_SOFTMMU
          {
 -            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
 -            gen(retv, cpu_env, addr, cmpv, newv, oi);
 -            tcg_temp_free_i32(oi);
 +            TCGMemOpIdx oi = make_memop_idx(memop & ~MO_SIGN, idx);
 +            gen(retv, cpu_env, addr, cmpv, newv, tcg_constant_i32(oi));
          }
  #else
          gen(retv, cpu_env, addr, cmpv, newv);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv,
  #ifdef CONFIG_SOFTMMU
          {
 -            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop, idx));
 -            gen(retv, cpu_env, addr, cmpv, newv, oi);
 -            tcg_temp_free_i32(oi);
 +            TCGMemOpIdx oi = make_memop_idx(memop, idx);
 +            gen(retv, cpu_env, addr, cmpv, newv, tcg_constant_i32(oi));
          }
  #else
          gen(retv, cpu_env, addr, cmpv, newv);
@@ -XXX,XX +XXX,XX @@ static void do_atomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
  #ifdef CONFIG_SOFTMMU
      {
 -        TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
 -        gen(ret, cpu_env, addr, val, oi);
 -        tcg_temp_free_i32(oi);
 +        TCGMemOpIdx oi = make_memop_idx(memop & ~MO_SIGN, idx);
 +        gen(ret, cpu_env, addr, val, tcg_constant_i32(oi));
      }
  #else
      gen(ret, cpu_env, addr, val);
@@ -XXX,XX +XXX,XX @@ static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
  #ifdef CONFIG_SOFTMMU
          {
 -            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
 -            gen(ret, cpu_env, addr, val, oi);
 -            tcg_temp_free_i32(oi);
 +            TCGMemOpIdx oi = make_memop_idx(memop & ~MO_SIGN, idx);
 +            gen(ret, cpu_env, addr, val, tcg_constant_i32(oi));
          }
  #else
          gen(ret, cpu_env, addr, val);
 --
 .25.1

-New patch
+[PULL 02/56] host-utils: move checks out of divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 In preparation for changing the divu128/divs128 implementations
 to allow for quotients larger than 64 bits, move the div-by-zero
 and overflow checks to the callers.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |  5 +++--
  include/qemu/host-utils.h | 34 ++++++++++++---------------------
  target/ppc/int_helper.c   | 14 +++++++++-----
  util/host-utils.c         | 40 ++++++++++++++++++---------------------
 files changed, 42 insertions(+), 51 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
          return 0;
      }
      /*
 -     * Ignore divu128() return value as we've caught div-by-zero and don't
 -     * need different behaviour for overflow.
 +     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 +     * divu128 does not return a valid truncated quotient, so the result will
 +     * be wrong.
       */
      divu128(&lo, &hi, clk->period);
      return lo;
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 -        __uint128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result > UINT64_MAX;
 -    }
 +    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 +    __uint128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
 -static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 -        __int128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result != *plow;
 -    }
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
      uint64_t rt = 0;
      int overflow = 0;
 -    overflow = divu128(&rt, &ra, rb);
 -
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || ra >= rb)) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divu128(&rt, &ra, rb);
      }
      if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
      int64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
 -    int overflow = divs128(&rt, &ra, rb);
 +    int overflow = 0;
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divs128(&rt, &ra, rb);
      }
      if (oe) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
      *phigh = rh;
  }
 -/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
 -/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
 -/* remainder via phigh. */
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +/*
 + * Unsigned 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
      unsigned i;
      uint64_t carry = 0;
 -    if (divisor == 0) {
 -        return 1;
 -    } else if (dhi == 0) {
 +    if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
          *phigh = dlo % divisor;
 -        return 0;
 -    } else if (dhi >= divisor) {
 -        return 1;
      } else {
          for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
          *plow = dlo;
          *phigh = dhi;
 -        return 0;
      }
  }
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +/*
 + * Signed 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
      int sgn_dvdnd = *phigh < 0;
      int sgn_divsr = divisor < 0;
 -    int overflow = 0;
      if (sgn_dvdnd) {
          *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
          divisor = 0 - divisor;
      }
 -    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 +    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
      if (sgn_dvdnd  ^ sgn_divsr) {
          *plow = 0 - *plow;
      }
 -
 -    if (!overflow) {
 -        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
 -            overflow = 1;
 -        }
 -    }
 -
 -    return overflow;
  }
  #endif
 --
 .25.1

-New patch
+[PULL 03/56] host-utils: move udiv_qrnnd() to host-utils
+From: Luis Pires <luis.pires@eldorado.org.br>
 Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
 so it can be reused by divu128().
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/fpu/softfloat-macros.h | 82 ----------------------------------
  include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 files changed, 81 insertions(+), 82 deletions(-)
 diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/fpu/softfloat-macros.h
 +++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
   * so some portions are provided under:
   *  the SoftFloat-2a license
   *  the BSD license
 - *  GPL-v2-or-later
   *
   * Any future contributions to this file after December 1st 2014 will be
   * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
   * THE POSSIBILITY OF SUCH DAMAGE.
   */
 -/* Portions of this work are licensed under the terms of the GNU GPL,
 - * version 2 or later. See the COPYING file in the top-level directory.
 - */
 -
  #ifndef FPU_SOFTFLOAT_MACROS_H
  #define FPU_SOFTFLOAT_MACROS_H
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
  }
 -/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 - * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 - *
 - * Licensed under the GPLv2/LGPLv3
 - */
 -static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 -                                  uint64_t n0, uint64_t d)
 -{
 -#if defined(__x86_64__)
 -    uint64_t q;
 -    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 -    return q;
 -#elif defined(__s390x__) && !defined(__clang__)
 -    /* Need to use a TImode type to get an even register pair for DLGR.  */
 -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 -    *r = n >> 64;
 -    return n;
 -#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 -    /* From Power ISA 2.06, programming note for divdeu.  */
 -    uint64_t q1, q2, Q, r1, r2, R;
 -    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 -        : "=&r"(q1), "=r"(q2)
 -        : "r"(n1), "r"(n0), "r"(d));
 -    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 -    r2 = n0 - (q2 * d);
 -    Q = q1 + q2;
 -    R = r1 + r2;
 -    if (R >= d || R < r2) { /* overflow implies R > d */
 -        Q += 1;
 -        R -= d;
 -    }
 -    *r = R;
 -    return Q;
 -#else
 -    uint64_t d0, d1, q0, q1, r1, r0, m;
 -
 -    d0 = (uint32_t)d;
 -    d1 = d >> 32;
 -
 -    r1 = n1 % d1;
 -    q1 = n1 / d1;
 -    m = q1 * d0;
 -    r1 = (r1 << 32) | (n0 >> 32);
 -    if (r1 < m) {
 -        q1 -= 1;
 -        r1 += d;
 -        if (r1 >= d) {
 -            if (r1 < m) {
 -                q1 -= 1;
 -                r1 += d;
 -            }
 -        }
 -    }
 -    r1 -= m;
 -
 -    r0 = r1 % d1;
 -    q0 = r1 / d1;
 -    m = q0 * d0;
 -    r0 = (r0 << 32) | (uint32_t)n0;
 -    if (r0 < m) {
 -        q0 -= 1;
 -        r0 += d;
 -        if (r0 >= d) {
 -            if (r0 < m) {
 -                q0 -= 1;
 -                r0 += d;
 -            }
 -        }
 -    }
 -    r0 -= m;
 -
 -    *r = r0;
 -    return (q1 << 32) | q0;
 -#endif
 -}
 -
  /*----------------------------------------------------------------------------
  | Returns an approximation to the square root of the 32-bit significand given
  | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
   * THE SOFTWARE.
   */
 +/* Portions of this work are licensed under the terms of the GNU GPL,
 + * version 2 or later. See the COPYING file in the top-level directory.
 + */
 +
  #ifndef HOST_UTILS_H
  #define HOST_UTILS_H
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
   */
  void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 +/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 + * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 + *
 + * Licensed under the GPLv2/LGPLv3
 + */
 +static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 +                                  uint64_t n0, uint64_t d)
 +{
 +#if defined(__x86_64__)
 +    uint64_t q;
 +    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 +    return q;
 +#elif defined(__s390x__) && !defined(__clang__)
 +    /* Need to use a TImode type to get an even register pair for DLGR.  */
 +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 +    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 +    *r = n >> 64;
 +    return n;
 +#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 +    /* From Power ISA 2.06, programming note for divdeu.  */
 +    uint64_t q1, q2, Q, r1, r2, R;
 +    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 +        : "=&r"(q1), "=r"(q2)
 +        : "r"(n1), "r"(n0), "r"(d));
 +    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 +    r2 = n0 - (q2 * d);
 +    Q = q1 + q2;
 +    R = r1 + r2;
 +    if (R >= d || R < r2) { /* overflow implies R > d */
 +        Q += 1;
 +        R -= d;
 +    }
 +    *r = R;
 +    return Q;
 +#else
 +    uint64_t d0, d1, q0, q1, r1, r0, m;
 +
 +    d0 = (uint32_t)d;
 +    d1 = d >> 32;
 +
 +    r1 = n1 % d1;
 +    q1 = n1 / d1;
 +    m = q1 * d0;
 +    r1 = (r1 << 32) | (n0 >> 32);
 +    if (r1 < m) {
 +        q1 -= 1;
 +        r1 += d;
 +        if (r1 >= d) {
 +            if (r1 < m) {
 +                q1 -= 1;
 +                r1 += d;
 +            }
 +        }
 +    }
 +    r1 -= m;
 +
 +    r0 = r1 % d1;
 +    q0 = r1 / d1;
 +    m = q0 * d0;
 +    r0 = (r0 << 32) | (uint32_t)n0;
 +    if (r0 < m) {
 +        q0 -= 1;
 +        r0 += d;
 +        if (r0 >= d) {
 +            if (r0 < m) {
 +                q0 -= 1;
 +                r0 += d;
 +            }
 +        }
 +    }
 +    r0 -= m;
 +
 +    *r = r0;
 +    return (q1 << 32) | q0;
 +#endif
 +}
 +
  #endif
 --
 .25.1

-New patch
+[PULL 04/56] host-utils: add 128-bit quotient support to divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 These will be used to implement new decimal floating point
 instructions from Power ISA 3.1.
 The remainder is now returned directly by divu128/divs128,
 freeing up phigh to receive the high 64 bits of the quotient.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |   6 +-
  include/qemu/host-utils.h |  20 ++++--
  target/ppc/int_helper.c   |   9 +--
  util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 files changed, 108 insertions(+), 60 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
      if (clk->period == 0) {
          return 0;
      }
 -    /*
 -     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 -     * divu128 does not return a valid truncated quotient, so the result will
 -     * be wrong.
 -     */
 +
      divu128(&lo, &hi, clk->period);
      return lo;
  }
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
 +                               uint64_t divisor)
  {
      __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
      __uint128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
 -static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
 +                              int64_t divisor)
  {
 -    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
      __int128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
  uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
  {
 -    int64_t rt = 0;
 +    uint64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
      int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
      int cr;
      uint64_t lo_value;
      uint64_t hi_value;
 +    uint64_t rem;
      ppc_avr_t ret = { .u64 = { 0, 0 } };
      if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
           * In that case, we leave r unchanged.
           */
      } else {
 -        divu128(&lo_value, &hi_value, 1000000000000000ULL);
 +        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 -        for (i = 1; i < 16; hi_value /= 10, i++) {
 -            bcd_put_digit(&ret, hi_value % 10, i);
 +        for (i = 1; i < 16; rem /= 10, i++) {
 +            bcd_put_digit(&ret, rem % 10, i);
          }
          for (; i < 32; lo_value /= 10, i++) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
  }
  /*
 - * Unsigned 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Unsigned 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
 -    unsigned i;
 -    uint64_t carry = 0;
 +    uint64_t rem, dhighest;
 +    int sh;
      if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
 -        *phigh = dlo % divisor;
 +        *phigh = 0;
 +        return dlo % divisor;
      } else {
 +        sh = clz64(divisor);
 -        for (i = 0; i < 64; i++) {
 -            carry = dhi >> 63;
 -            dhi = (dhi << 1) | (dlo >> 63);
 -            if (carry || (dhi >= divisor)) {
 -                dhi -= divisor;
 -                carry = 1;
 -            } else {
 -                carry = 0;
 +        if (dhi < divisor) {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
              }
 -            dlo = (dlo << 1) | carry;
 +
 +            *phigh = 0;
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
 +        } else {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhighest = dhi >> (64 - sh);
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
 +
 +                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
 +            } else {
 +                /**
 +                 * dhi >= divisor
 +                 * Since the MSB of divisor is set (sh == 0),
 +                 * (dhi - divisor) < divisor
 +                 *
 +                 * Thus, the high part of the quotient is 1, and we can
 +                 * calculate the low part with a single call to udiv_qrnnd
 +                 * after subtracting divisor from dhi
 +                 */
 +                dhi -= divisor;
 +                *phigh = 1;
 +            }
 +
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
          }
 -        *plow = dlo;
 -        *phigh = dhi;
 +        /*
 +         * since the dividend/divisor might have been normalized,
 +         * the remainder might also have to be shifted back
 +         */
 +        return rem >> sh;
      }
  }
  /*
 - * Signed 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Signed 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    int sgn_dvdnd = *phigh < 0;
 -    int sgn_divsr = divisor < 0;
 +    bool neg_quotient = false, neg_remainder = false;
 +    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
 +    uint64_t rem;
 -    if (sgn_dvdnd) {
 -        *plow = ~(*plow);
 -        *phigh = ~(*phigh);
 -        if (*plow == (int64_t)-1) {
 +    if (*phigh < 0) {
 +        neg_quotient = !neg_quotient;
 +        neg_remainder = !neg_remainder;
 +
 +        if (unsig_lo == 0) {
 +            unsig_hi = -unsig_hi;
 +        } else {
 +            unsig_hi = ~unsig_hi;
 +            unsig_lo = -unsig_lo;
 +        }
 +    }
 +
 +    if (divisor < 0) {
 +        neg_quotient = !neg_quotient;
 +
 +        divisor = -divisor;
 +    }
 +
 +    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
 +
 +    if (neg_quotient) {
 +        if (unsig_lo == 0) {
 +            *phigh = -unsig_hi;
              *plow = 0;
 -            (*phigh)++;
 -         } else {
 -            (*plow)++;
 -         }
 +        } else {
 +            *phigh = ~unsig_hi;
 +            *plow = -unsig_lo;
 +        }
 +    } else {
 +        *phigh = unsig_hi;
 +        *plow = unsig_lo;
      }
 -    if (sgn_divsr) {
 -        divisor = 0 - divisor;
 -    }
 -
 -    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 -
 -    if (sgn_dvdnd  ^ sgn_divsr) {
 -        *plow = 0 - *plow;
 +    if (neg_remainder) {
 +        return -rem;
 +    } else {
 +        return rem;
      }
  }
  #endif
 --
 .25.1

-[PATCH 34/43] tcg: Add tcg-constr.c.inc
+[PULL 05/56] host-utils: add unit tests for divu128/divs128
-Begin conversion of constraints to pre-validated, read-only entities.
+From: Luis Pires <luis.pires@eldorado.org.br>
-To begin, create a simple method by which sets of TCGTargetOpDef
-structures may be declared and used.  This simplifies each host's
+Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
-tcg_target_op_def function and ensures that we have a collected
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
-set of constraints.
+Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg-constr.c.inc | 108 +++++++++++++++++++++++++++++++++++++++++++
+ tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
-file changed, 108 insertions(+)
+ tests/unit/meson.build   |   1 +
- create mode 100644 tcg/tcg-constr.c.inc
+files changed, 198 insertions(+)
+ create mode 100644 tests/unit/test-div128.c
-diff --git a/tcg/tcg-constr.c.inc b/tcg/tcg-constr.c.inc
 diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
-+++ b/tcg/tcg-constr.c.inc
++++ b/tests/unit/test-div128.c
 @@ -XXX,XX +XXX,XX @@
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
-+ * TCG backend data: operand constraints.
++ * Test 128-bit division functions
-+ * Copyright (c) 2020 Linaro
++ *
 + * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 + */
 +
-+/*
++#include "qemu/osdep.h"
-+ * Define structures for each set of constraints.
++#include "qemu/host-utils.h"
-+ */
++
-+
++typedef struct {
-+#define C_PFX1(P, A)                 P##A
++    uint64_t high;
-+#define C_PFX2(P, A, B)              P##A##_##B
++    uint64_t low;
-+#define C_PFX3(P, A, B, C)           P##A##_##B##_##C
++    uint64_t rhigh;
-+#define C_PFX4(P, A, B, C, D)        P##A##_##B##_##C##_##D
++    uint64_t rlow;
-+#define C_PFX5(P, A, B, C, D, E)     P##A##_##B##_##C##_##D##_##E
++    uint64_t divisor;
-+#define C_PFX6(P, A, B, C, D, E, F)  P##A##_##B##_##C##_##D##_##E##_##F
++    uint64_t remainder;
-+
++} test_data_unsigned;
-+#define C_O0_I1(I1) \
++
-+    static const TCGTargetOpDef C_PFX1(c_o0_i1_, I1) \
++typedef struct {
-+      = { .args_ct_str = { #I1 } };
++    int64_t high;
-+
++    uint64_t low;
-+#define C_O0_I2(I1, I2) \
++    int64_t rhigh;
-+    static const TCGTargetOpDef C_PFX2(c_o0_i2_, I1, I2) \
++    uint64_t rlow;
-+      = { .args_ct_str = { #I1, #I2 } };
++    int64_t divisor;
-+
++    int64_t remainder;
-+#define C_O0_I3(I1, I2, I3) \
++} test_data_signed;
-+    static const TCGTargetOpDef C_PFX3(c_o0_i3_, I1, I2, I3) \
++
-+      = { .args_ct_str = { #I1, #I2, #I3 } };
++static const test_data_unsigned test_table_unsigned[] = {
-+
++    /* Dividend fits in 64 bits */
-+#define C_O0_I4(I1, I2, I3, I4) \
++    { 0x0000000000000000ULL, 0x0000000000000000ULL,
-+    static const TCGTargetOpDef C_PFX4(c_o0_i4_, I1, I2, I3, I4) \
++      0x0000000000000000ULL, 0x0000000000000000ULL,
-+      = { .args_ct_str = { #I1, #I2, #I3, #I4 } };
++      0x0000000000000001ULL, 0x0000000000000000ULL},
-+
++    { 0x0000000000000000ULL, 0x0000000000000001ULL,
-+#define C_O1_I1(O1, I1) \
++      0x0000000000000000ULL, 0x0000000000000001ULL,
-+    static const TCGTargetOpDef C_PFX2(c_o1_i1_, O1, I1) \
++      0x0000000000000001ULL, 0x0000000000000000ULL},
-+      = { .args_ct_str = { #O1, #I1 } };
++    { 0x0000000000000000ULL, 0x0000000000000003ULL,
-+
++      0x0000000000000000ULL, 0x0000000000000001ULL,
-+#define C_O1_I2(O1, I1, I2) \
++      0x0000000000000002ULL, 0x0000000000000001ULL},
-+    static const TCGTargetOpDef C_PFX3(c_o1_i2_, O1, I1, I2) \
++    { 0x0000000000000000ULL, 0x8000000000000000ULL,
-+      = { .args_ct_str = { #O1, #I1, #I2 } };
++      0x0000000000000000ULL, 0x8000000000000000ULL,
-+
++      0x0000000000000001ULL, 0x0000000000000000ULL},
-+#define C_O1_I3(O1, I1, I2, I3) \
++    { 0x0000000000000000ULL, 0xa000000000000000ULL,
-+    static const TCGTargetOpDef C_PFX4(c_o1_i3_, O1, I1, I2, I3) \
++      0x0000000000000000ULL, 0x0000000000000002ULL,
-+      = { .args_ct_str = { #O1, #I1, #I2, #I3 } };
++      0x4000000000000000ULL, 0x2000000000000000ULL},
-+
++    { 0x0000000000000000ULL, 0x8000000000000000ULL,
-+#define C_O1_I4(O1, I1, I2, I3, I4) \
++      0x0000000000000000ULL, 0x0000000000000001ULL,
-+    static const TCGTargetOpDef C_PFX5(c_o1_i4_, O1, I1, I2, I3, I4) \
++      0x8000000000000000ULL, 0x0000000000000000ULL},
-+      = { .args_ct_str = { #O1, #I1, #I2, #I3, #I4 } };
++
-+
++    /* Dividend > 64 bits, with MSB 0 */
-+#define C_N1_I2(O1, I1, I2) \
++    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
-+    static const TCGTargetOpDef C_PFX3(c_n1_i2_, O1, I1, I2) \
++      0x123456789abcdefeULL, 0xefedcba987654321ULL,
-+      = { .args_ct_str = { "&" #O1, #I1, #I2 } };
++      0x0000000000000001ULL, 0x0000000000000000ULL},
-+
++    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
-+#define C_O2_I1(O1, O2, I1) \
++      0x0000000000000001ULL, 0x000000000000000dULL,
-+    static const TCGTargetOpDef C_PFX3(c_o2_i1_, O1, O2, I1) \
++      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
-+      = { .args_ct_str = { #O1, #O2, #I1 } };
++    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
-+
++      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
-+#define C_O2_I2(O1, O2, I1, I2) \
++      0x0000000000000010ULL, 0x0000000000000001ULL},
-+    static const TCGTargetOpDef C_PFX4(c_o2_i2_, O1, O2, I1, I2) \
++
-+      = { .args_ct_str = { #O1, #O2, #I1, #I2 } };
++    /* Dividend > 64 bits, with MSB 1 */
-+
++    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
-+#define C_O2_I3(O1, O2, I1, I2, I3) \
++      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
-+    static const TCGTargetOpDef C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3) \
++      0x0000000000000001ULL, 0x0000000000000000ULL},
-+      = { .args_ct_str = { #O1, #O2, #I1, #I2, #I3 } };
++    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
-+
++      0x0000000000000001ULL, 0x0000000000000000ULL,
-+#define C_O2_I4(O1, O2, I1, I2, I3, I4) \
++      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
-+    static const TCGTargetOpDef C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4) \
++    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
-+      = { .args_ct_str = { #O1, #O2, #I1, #I2, #I3, #I4 } };
++      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
-+
++      0x0000000000000010ULL, 0x000000000000000fULL},
-+#include "tcg-target-constr.h"
++    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
-+
++      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
-+
++      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
-+/*
++
-+ * Redefine the macros so that they now reference those structures.
++    /**
-+ * These values should be returned from tcg_target_op_def().
++     * Divisor == 64 bits, with MSB 1
-+ */
++     * and high 64 bits of dividend >= divisor
-+
++     * (for testing normalization)
-+#undef C_O0_I1
++     */
-+#undef C_O0_I2
++    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
-+#undef C_O0_I3
++      0x0000000000000001ULL, 0x0000000000000000ULL,
-+#undef C_O0_I4
++      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
-+#undef C_O1_I1
++    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
-+#undef C_O1_I2
++      0x0000000000000001ULL, 0xfddbb9977553310aULL,
-+#undef C_O1_I3
++      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
-+#undef C_O1_I4
++
-+#undef C_N1_I2
++    /* Dividend > 64 bits, divisor almost as big */
-+#undef C_O2_I1
++    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
-+#undef C_O2_I2
++      0x0000000000000000ULL, 0x000000000000000fULL,
-+#undef C_O2_I3
++      0x123456789abcdefeULL, 0x123456789abcde1fULL},
-+#undef C_O2_I4
++};
 +
-+#define C_O0_I1(I1)                     &C_PFX1(c_o0_i1_, I1)
++static const test_data_signed test_table_signed[] = {
-+#define C_O0_I2(I1, I2)                 &C_PFX2(c_o0_i2_, I1, I2)
++    /* Positive dividend, positive/negative divisors */
-+#define C_O0_I3(I1, I2, I3)             &C_PFX3(c_o0_i3_, I1, I2, I3)
++    { 0x0000000000000000LL, 0x0000000000bc614eULL,
-+#define C_O0_I4(I1, I2, I3, I4)         &C_PFX4(c_o0_i4_, I1, I2, I3, I4)
++      0x0000000000000000LL, 0x0000000000bc614eULL,
-+
++      0x0000000000000001LL, 0x0000000000000000LL},
-+#define C_O1_I1(O1, I1)                 &C_PFX2(c_o1_i1_, O1, I1)
++    { 0x0000000000000000LL, 0x0000000000bc614eULL,
-+#define C_O1_I2(O1, I1, I2)             &C_PFX3(c_o1_i2_, O1, I1, I2)
++      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
-+#define C_O1_I3(O1, I1, I2, I3)         &C_PFX4(c_o1_i3_, O1, I1, I2, I3)
++      0xffffffffffffffffLL, 0x0000000000000000LL},
-+#define C_O1_I4(O1, I1, I2, I3, I4)     &C_PFX5(c_o1_i4_, O1, I1, I2, I3, I4)
++    { 0x0000000000000000LL, 0x0000000000bc614eULL,
-+
++      0x0000000000000000LL, 0x00000000005e30a7ULL,
-+#define C_N1_I2(O1, I1, I2)             &C_PFX3(c_n1_i2_, O1, I1, I2)
++      0x0000000000000002LL, 0x0000000000000000LL},
-+
++    { 0x0000000000000000LL, 0x0000000000bc614eULL,
-+#define C_O2_I1(O1, O2, I1)             &C_PFX3(c_o2_i1_, O1, O2, I1)
++      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
-+#define C_O2_I2(O1, O2, I1, I2)         &C_PFX4(c_o2_i2_, O1, O2, I1, I2)
++      0xfffffffffffffffeLL, 0x0000000000000000LL},
-+#define C_O2_I3(O1, O2, I1, I2, I3)     &C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3)
++    { 0x0000000000000000LL, 0x0000000000bc614eULL,
-+#define C_O2_I4(O1, O2, I1, I2, I3, I4) \
++      0x0000000000000000LL, 0x0000000000178c29ULL,
-+    &C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4)
++      0x0000000000000008LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0xfffffffffffffff8LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0x0000000000000237LL, 0x0000000000000183LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0xfffffffffffffdc9LL, 0x0000000000000183LL},
 +
 +    /* Negative dividend, positive/negative divisors */
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0x0000000000000008LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0x0000000000000237LL, 0xfffffffffffffe7dLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
 +};
 +
 +static void test_divu128(void)
 +{
 +    int i;
 +    uint64_t rem;
 +    test_data_unsigned tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
 +        tmp = test_table_unsigned[i];
 +
 +        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +static void test_divs128(void)
 +{
 +    int i;
 +    int64_t rem;
 +    test_data_signed tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
 +        tmp = test_table_signed[i];
 +
 +        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +int main(int argc, char **argv)
 +{
 +    g_test_init(&argc, &argv, NULL);
 +    g_test_add_func("/host-utils/test_divu128", test_divu128);
 +    g_test_add_func("/host-utils/test_divs128", test_divs128);
 +    return g_test_run();
 +}
 diff --git a/tests/unit/meson.build b/tests/unit/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/unit/meson.build
 +++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
    # all code tested by test-x86-cpuid is inside topology.h
    'test-x86-cpuid': [],
    'test-cutils': [],
 +  'test-div128': [],
    'test-shift128': [],
    'test-mul64': [],
    # all code tested by test-int128 is inside int128.h
 --
 .25.1

-[PATCH 17/43] tcg: Expand TempOptInfo to 64-bits
+[PULL 06/56] tcg/optimize: Rename "mask" to "z_mask"
-This propagates the extended value of TCGTemp.val that we did before.
+Prepare for tracking different masks by renaming this one.
 In addition, it will be required for vector constants.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/optimize.c | 40 +++++++++++++++++++++-------------------
+ tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
-file changed, 21 insertions(+), 19 deletions(-)
+file changed, 72 insertions(+), 70 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
 @@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-     bool is_const;
      TCGTemp *prev_copy;
      TCGTemp *next_copy;
--    tcg_target_ulong val;
+     uint64_t val;
--    tcg_target_ulong mask;
+-    uint64_t mask;
-+    uint64_t val;
++    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 +    uint64_t mask;
  } TempOptInfo;
  static inline TempOptInfo *ts_info(TCGTemp *ts)
-@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
+@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
-     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
+     ti->next_copy = ts;
      ti->prev_copy = ts;
      ti->is_const = false;
 -    ti->mask = -1;
 +    ti->z_mask = -1;
  }
--static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val)
+ static void reset_temp(TCGArg arg)
-+static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, uint64_t val)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
- {
+     if (ts->kind == TEMP_CONST) {
-     const TCGOpDef *def;
+         ti->is_const = true;
-     TCGOpcode new_op;
+         ti->val = ts->val;
--    tcg_target_ulong mask;
+-        ti->mask = ts->val;
-+    uint64_t mask;
++        ti->z_mask = ts->val;
-     TempOptInfo *di = arg_info(dst);
+         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
+             /* High bits of a 32-bit quantity are garbage.  */
-     def = &tcg_op_defs[op->opc];
+-            ti->mask |= ~0xffffffffull;
 +            ti->z_mask |= ~0xffffffffull;
          }
      } else {
          ti->is_const = false;
 -        ti->mask = -1;
 +        ti->z_mask = -1;
      }
  }
 @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      const TCGOpDef *def;
      TempOptInfo *di;
      TempOptInfo *si;
--    tcg_target_ulong mask;
+-    uint64_t mask;
-+    uint64_t mask;
++    uint64_t z_mask;
      TCGOpcode new_op;
      if (ts_are_copies(dst_ts, src_ts)) {
 @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+     op->args[0] = dst;
+     op->args[1] = src;
+-    mask = si->mask;
++    z_mask = si->z_mask;
+     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+         /* High bits of the destination are now garbage.  */
+-        mask |= ~0xffffffffull;
++        z_mask |= ~0xffffffffull;
      }
- }
+-    di->mask = mask;
++    di->z_mask = z_mask;
--static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
-+static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
+     if (src_ts->type == dst_ts->type) {
- {
+         TempOptInfo *ni = ts_info(si->next_copy);
-     uint64_t l64, h64;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
      }
- }
--static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
-+static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
- {
-     const TCGOpDef *def = &tcg_op_defs[op];
--    TCGArg res = do_constant_folding_2(op, x, y);
-+    uint64_t res = do_constant_folding_2(op, x, y);
-     if (!(def->flags & TCG_OPF_64BIT)) {
-         res = (int32_t)res;
-     }
-@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
- static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
-                                        TCGArg y, TCGCond c)
- {
--    tcg_target_ulong xv = arg_info(x)->val;
--    tcg_target_ulong yv = arg_info(y)->val;
-+    uint64_t xv = arg_info(x)->val;
-+    uint64_t yv = arg_info(y)->val;
-+
-     if (arg_is_const(x) && arg_is_const(y)) {
-         const TCGOpDef *def = &tcg_op_defs[op];
-         tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
-@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
--        tcg_target_ulong mask, partmask, affected;
+-        uint64_t mask, partmask, affected, tmp;
-+        uint64_t mask, partmask, affected, tmp;
++        uint64_t z_mask, partmask, affected, tmp;
-         int nb_oargs, nb_iargs, i;
+         int nb_oargs, nb_iargs;
 -        TCGArg tmp;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def = &tcg_op_defs[opc];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         /* Simplify using known-zero bits. Currently only ops with a single
-         CASE_OP_32_64(extract2):
+            output argument is supported. */
-             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-        mask = -1;
--                TCGArg v1 = arg_info(op->args[1])->val;
++        z_mask = -1;
--                TCGArg v2 = arg_info(op->args[2])->val;
+         affected = -1;
-+                uint64_t v1 = arg_info(op->args[1])->val;
+         switch (opc) {
-+                uint64_t v2 = arg_info(op->args[2])->val;
+         CASE_OP_32_64(ext8s):
-+                int shr = op->args[3];
+-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
++            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
-                 if (opc == INDEX_op_extract2_i64) {
+                 break;
--                    tmp = (v1 >> op->args[3]) | (v2 << (64 - op->args[3]));
+             }
-+                    tmp = (v1 >> shr) | (v2 << (64 - shr));
+             QEMU_FALLTHROUGH;
-                 } else {
+         CASE_OP_32_64(ext8u):
--                    tmp = (int32_t)(((uint32_t)v1 >> op->args[3]) |
+-            mask = 0xff;
--                                    ((uint32_t)v2 << (32 - op->args[3])));
++            z_mask = 0xff;
-+                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
+             goto and_const;
-+                                    ((uint32_t)v2 << (32 - shr)));
+         CASE_OP_32_64(ext16s):
 -            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
 +            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
                  break;
              }
              QEMU_FALLTHROUGH;
          CASE_OP_32_64(ext16u):
 -            mask = 0xffff;
 +            z_mask = 0xffff;
              goto and_const;
          case INDEX_op_ext32s_i64:
 -            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
 +            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                  break;
              }
              QEMU_FALLTHROUGH;
          case INDEX_op_ext32u_i64:
 -            mask = 0xffffffffU;
 +            z_mask = 0xffffffffU;
              goto and_const;
          CASE_OP_32_64(and):
 -            mask = arg_info(op->args[2])->mask;
 +            z_mask = arg_info(op->args[2])->z_mask;
              if (arg_is_const(op->args[2])) {
          and_const:
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
 -            mask = arg_info(op->args[1])->mask & mask;
 +            z_mask = arg_info(op->args[1])->z_mask & z_mask;
              break;
          case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
 +            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                  break;
              }
              QEMU_FALLTHROUGH;
          case INDEX_op_extu_i32_i64:
              /* We do not compute affected as it is a size changing op.  */
 -            mask = (uint32_t)arg_info(op->args[1])->mask;
 +            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
              break;
          CASE_OP_32_64(andc):
              /* Known-zeros does not imply known-ones.  Therefore unless
                 op->args[2] is constant, we can't infer anything from it.  */
              if (arg_is_const(op->args[2])) {
 -                mask = ~arg_info(op->args[2])->mask;
 +                z_mask = ~arg_info(op->args[2])->z_mask;
                  goto and_const;
              }
              /* But we certainly know nothing outside args[1] may be set. */
 -            mask = arg_info(op->args[1])->mask;
 +            z_mask = arg_info(op->args[1])->z_mask;
              break;
          case INDEX_op_sar_i32:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 31;
 -                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_sar_i64:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 63;
 -                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_shr_i32:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 31;
 -                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_shr_i64:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 63;
 -                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_extrl_i64_i32:
 -            mask = (uint32_t)arg_info(op->args[1])->mask;
 +            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
              break;
          case INDEX_op_extrh_i64_i32:
 -            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
 +            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
              break;
          CASE_OP_32_64(shl):
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
 -                mask = arg_info(op->args[1])->mask << tmp;
 +                z_mask = arg_info(op->args[1])->z_mask << tmp;
              }
              break;
          CASE_OP_32_64(neg):
              /* Set to 1 all bits to the left of the rightmost.  */
 -            mask = -(arg_info(op->args[1])->mask
 -                     & -arg_info(op->args[1])->mask);
 +            z_mask = -(arg_info(op->args[1])->z_mask
 +                       & -arg_info(op->args[1])->z_mask);
              break;
          CASE_OP_32_64(deposit):
 -            mask = deposit64(arg_info(op->args[1])->mask,
 -                             op->args[3], op->args[4],
 -                             arg_info(op->args[2])->mask);
 +            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                               op->args[3], op->args[4],
 +                               arg_info(op->args[2])->z_mask);
              break;
          CASE_OP_32_64(extract):
 -            mask = extract64(arg_info(op->args[1])->mask,
 -                             op->args[2], op->args[3]);
 +            z_mask = extract64(arg_info(op->args[1])->z_mask,
 +                               op->args[2], op->args[3]);
              if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
              break;
          CASE_OP_32_64(sextract):
 -            mask = sextract64(arg_info(op->args[1])->mask,
 -                              op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 +                                op->args[2], op->args[3]);
 +            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
              break;
          CASE_OP_32_64(or):
          CASE_OP_32_64(xor):
 -            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
 +            z_mask = arg_info(op->args[1])->z_mask
 +                   | arg_info(op->args[2])->z_mask;
              break;
          case INDEX_op_clz_i32:
          case INDEX_op_ctz_i32:
 -            mask = arg_info(op->args[2])->mask | 31;
 +            z_mask = arg_info(op->args[2])->z_mask | 31;
              break;
          case INDEX_op_clz_i64:
          case INDEX_op_ctz_i64:
 -            mask = arg_info(op->args[2])->mask | 63;
 +            z_mask = arg_info(op->args[2])->z_mask | 63;
              break;
          case INDEX_op_ctpop_i32:
 -            mask = 32 | 31;
 +            z_mask = 32 | 31;
              break;
          case INDEX_op_ctpop_i64:
 -            mask = 64 | 63;
 +            z_mask = 64 | 63;
              break;
          CASE_OP_32_64(setcond):
          case INDEX_op_setcond2_i32:
 -            mask = 1;
 +            z_mask = 1;
              break;
          CASE_OP_32_64(movcond):
 -            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
 +            z_mask = arg_info(op->args[3])->z_mask
 +                   | arg_info(op->args[4])->z_mask;
              break;
          CASE_OP_32_64(ld8u):
 -            mask = 0xff;
 +            z_mask = 0xff;
              break;
          CASE_OP_32_64(ld16u):
 -            mask = 0xffff;
 +            z_mask = 0xffff;
              break;
          case INDEX_op_ld32u_i64:
 -            mask = 0xffffffffu;
 +            z_mask = 0xffffffffu;
              break;
          CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                  MemOp mop = get_memop(oi);
                  if (!(mop & MO_SIGN)) {
 -                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 +                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                  }
-                 tcg_opt_gen_movi(s, op, op->args[0], tmp);
+             }
-                 break;
+             break;
-@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-                 break;
+         CASE_OP_32_64(bswap16):
-             }
+-            mask = arg_info(op->args[1])->mask;
-             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+-            if (mask <= 0xffff) {
--                tcg_target_ulong tv = arg_info(op->args[3])->val;
++            z_mask = arg_info(op->args[1])->z_mask;
--                tcg_target_ulong fv = arg_info(op->args[4])->val;
++            if (z_mask <= 0xffff) {
-+                uint64_t tv = arg_info(op->args[3])->val;
+                 op->args[2] |= TCG_BSWAP_IZ;
-+                uint64_t fv = arg_info(op->args[4])->val;
+             }
-                 TCGCond cond = op->args[5];
+-            mask = bswap16(mask);
-+
++            z_mask = bswap16(z_mask);
-                 if (fv == 1 && tv == 0) {
+             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-                     cond = tcg_invert_cond(cond);
+             case TCG_BSWAP_OZ:
-                 } else if (!(tv == 1 && fv == 0)) {
+                 break;
              case TCG_BSWAP_OS:
 -                mask = (int16_t)mask;
 +                z_mask = (int16_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(16, 48);
 +                z_mask |= MAKE_64BIT_MASK(16, 48);
                  break;
              }
              break;
          case INDEX_op_bswap32_i64:
 -            mask = arg_info(op->args[1])->mask;
 -            if (mask <= 0xffffffffu) {
 +            z_mask = arg_info(op->args[1])->z_mask;
 +            if (z_mask <= 0xffffffffu) {
                  op->args[2] |= TCG_BSWAP_IZ;
              }
 -            mask = bswap32(mask);
 +            z_mask = bswap32(z_mask);
              switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
              case TCG_BSWAP_OZ:
                  break;
              case TCG_BSWAP_OS:
 -                mask = (int32_t)mask;
 +                z_mask = (int32_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(32, 32);
 +                z_mask |= MAKE_64BIT_MASK(32, 32);
                  break;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          /* 32-bit ops generate 32-bit results.  For the result is zero test
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
 -        partmask = mask;
 +        partmask = z_mask;
          if (!(def->flags & TCG_OPF_64BIT)) {
 -            mask |= ~(tcg_target_ulong)0xffffffffu;
 +            z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     vs the high word of the input.  */
              do_setcond_high:
                  reset_temp(op->args[0]);
 -                arg_info(op->args[0])->mask = 1;
 +                arg_info(op->args[0])->z_mask = 1;
                  op->opc = INDEX_op_setcond_i32;
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              do_setcond_low:
                  reset_temp(op->args[0]);
 -                arg_info(op->args[0])->mask = 1;
 +                arg_info(op->args[0])->z_mask = 1;
                  op->opc = INDEX_op_setcond_i32;
                  op->args[2] = op->args[3];
                  op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              /* Default case: we know nothing about operation (or were unable
                 to compute the operation result) so no propagation is done.
                 We trash everything if the operation is the end of a basic
 -               block, otherwise we only trash the output args.  "mask" is
 +               block, otherwise we only trash the output args.  "z_mask" is
                 the non-zero bits mask for the first output arg.  */
              if (def->flags & TCG_OPF_BB_END) {
                  memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Save the corresponding known-zero bits mask for the
                         first output argument (only one supported so far). */
                      if (i == 0) {
 -                        arg_info(op->args[i])->mask = mask;
 +                        arg_info(op->args[i])->z_mask = z_mask;
                      }
                  }
              }
 --
 .25.1

-[PATCH 21/43] tcg/optimize: Use tcg_constant_internal with constant folding
+[PULL 07/56] tcg/optimize: Split out OptContext
+Provide what will become a larger context for splitting
+the very large tcg_optimize function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/optimize.c | 108 ++++++++++++++++++++++---------------------------
+ tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
-file changed, 49 insertions(+), 59 deletions(-)
+file changed, 40 insertions(+), 37 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
- }
+ } TempOptInfo;
--static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, uint64_t val)
++typedef struct OptContext {
--{
++    TCGTempSet temps_used;
--    const TCGOpDef *def;
++} OptContext;
--    TCGOpcode new_op;
++
--    uint64_t mask;
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
--    TempOptInfo *di = arg_info(dst);
+ {
--
+     return ts->state_ptr;
--    def = &tcg_op_defs[op->opc];
+@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
--    if (def->flags & TCG_OPF_VECTOR) {
+ }
--        new_op = INDEX_op_dupi_vec;
--    } else if (def->flags & TCG_OPF_64BIT) {
+ /* Initialize and activate a temporary.  */
--        new_op = INDEX_op_movi_i64;
+-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
--    } else {
++static void init_ts_info(OptContext *ctx, TCGTemp *ts)
--        new_op = INDEX_op_movi_i32;
+ {
--    }
+     size_t idx = temp_idx(ts);
--    op->opc = new_op;
+     TempOptInfo *ti;
--    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
--    op->args[0] = dst;
+-    if (test_bit(idx, temps_used->l)) {
--    op->args[1] = val;
++    if (test_bit(idx, ctx->temps_used.l)) {
--
+         return;
--    reset_temp(dst);
+     }
--    di->is_const = true;
+-    set_bit(idx, temps_used->l);
--    di->val = val;
++    set_bit(idx, ctx->temps_used.l);
--    mask = val;
--    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_movi_i32) {
+     ti = ts->state_ptr;
--        /* High bits of the destination are now garbage.  */
+     if (ti == NULL) {
--        mask |= ~0xffffffffull;
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
--    }
+     }
--    di->mask = mask;
+ }
--}
--
+-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
- static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
++static void init_arg_info(OptContext *ctx, TCGArg arg)
  {
-     TCGTemp *dst_ts = arg_temp(dst);
+-    init_ts_info(temps_used, arg_temp(arg));
 +    init_ts_info(ctx, arg_temp(arg));
  }
  static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 @@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      }
  }
-+static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
-+                             TCGOp *op, TCGArg dst, uint64_t val)
++static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
-+{
+                              TCGOp *op, TCGArg dst, uint64_t val)
-+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+ {
-+    TCGType type;
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
-+    TCGTemp *tv;
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
-+
-+    if (def->flags & TCG_OPF_VECTOR) {
+     /* Convert movi to mov with constant temp. */
-+        type = TCGOP_VECL(op) + TCG_TYPE_V64;
+     tv = tcg_constant_internal(type, val);
-+    } else if (def->flags & TCG_OPF_64BIT) {
+-    init_ts_info(temps_used, tv);
-+        type = TCG_TYPE_I64;
++    init_ts_info(ctx, tv);
-+    } else {
+     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
-+        type = TCG_TYPE_I32;
+ }
-+    }
-+
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+    /* Convert movi to mov with constant temp. */
+ {
-+    tv = tcg_constant_internal(type, val);
+     int nb_temps, nb_globals, i;
-+    init_ts_info(temps_used, tv);
+     TCGOp *op, *op_next, *prev_mb = NULL;
-+    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+-    TCGTempSet temps_used;
-+}
++    OptContext ctx = {};
-+
- static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
+     /* Array VALS has an element for each temp.
- {
+        If this temp holds a constant then its value is kept in VALS' element.
      uint64_t l64, h64;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      nb_temps = s->nb_temps;
      nb_globals = s->nb_globals;
--    bitmap_zero(temps_used.l, nb_temps);
+-    memset(&temps_used, 0, sizeof(temps_used));
 +    memset(&temps_used, 0, sizeof(temps_used));
      for (i = 0; i < nb_temps; ++i) {
          s->temps[i].state_ptr = NULL;
      }
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+                 TCGTemp *ts = arg_temp(op->args[i]);
+                 if (ts) {
+-                    init_ts_info(&temps_used, ts);
++                    init_ts_info(&ctx, ts);
+                 }
+             }
+         } else {
+             nb_oargs = def->nb_oargs;
+             nb_iargs = def->nb_iargs;
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+-                init_arg_info(&temps_used, op->args[i]);
++                init_arg_info(&ctx, op->args[i]);
+             }
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(rotr):
              if (arg_is_const(op->args[1])
                  && arg_info(op->args[1])->val == 0) {
--                tcg_opt_gen_movi(s, op, op->args[0], 0);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                  continue;
              }
              break;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (partmask == 0) {
              tcg_debug_assert(nb_oargs == 1);
--            tcg_opt_gen_movi(s, op, op->args[0], 0);
+-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
-+            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
              continue;
          }
          if (affected == 0) {
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(mulsh):
              if (arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
--                tcg_opt_gen_movi(s, op, op->args[0], 0);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                  continue;
              }
              break;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
          CASE_OP_32_64_VEC(xor):
              if (args_are_copies(op->args[1], op->args[2])) {
--                tcg_opt_gen_movi(s, op, op->args[0], 0);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                  continue;
              }
              break;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-             break;
-         CASE_OP_32_64(movi):
-         case INDEX_op_dupi_vec:
--            tcg_opt_gen_movi(s, op, op->args[0], op->args[1]);
-+            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], op->args[1]);
-             break;
-         case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                 tmp = arg_info(op->args[1])->val;
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
-                 if (tmp == arg_info(op->args[2])->val) {
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
--                    tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
-+                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                                            arg_info(op->args[2])->val));
-                     break;
+                 break;
                  }
              } else if (args_are_copies(op->args[1], op->args[2])) {
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
 -                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGArg v = arg_info(op->args[1])->val;
                  if (v != 0) {
                      tmp = do_constant_folding(opc, v, 0);
--                    tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  } else {
                      tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                  }
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = deposit64(arg_info(op->args[1])->val,
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                      ((uint32_t)v2 << (32 - shr)));
                  }
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[1], op->args[2]);
              if (tmp != 2) {
                  if (tmp) {
--                    bitmap_zero(temps_used.l, nb_temps);
+-                    memset(&temps_used, 0, sizeof(temps_used));
-+                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                      op->opc = INDEX_op_br;
                      op->args[0] = op->args[3];
                  } else {
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-                 uint64_t a = ((uint64_t)ah << 32) | al;
-                 uint64_t b = ((uint64_t)bh << 32) | bl;
-                 TCGArg rl, rh;
--                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32);
-+                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-                 if (opc == INDEX_op_add2_i32) {
-                     a += b;
-@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
--                tcg_opt_gen_movi(s, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
--                tcg_opt_gen_movi(s, op2, rh, (int32_t)(a >> 32));
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
-+                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
-+                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  uint32_t b = arg_info(op->args[3])->val;
                  uint64_t r = (uint64_t)a * b;
                  TCGArg rl, rh;
 -                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32);
 +                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
                  rl = op->args[0];
                  rh = op->args[1];
--                tcg_opt_gen_movi(s, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
--                tcg_opt_gen_movi(s, op2, rh, (int32_t)(r >> 32));
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
-+                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
-+                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
                  break;
              }
              goto do_default;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (tmp != 2) {
                  if (tmp) {
              do_brcond_true:
--                    bitmap_zero(temps_used.l, nb_temps);
+-                    memset(&temps_used, 0, sizeof(temps_used));
-+                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                      op->opc = INDEX_op_br;
                      op->args[0] = op->args[5];
                  } else {
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  /* Simplify LT/GE comparisons vs zero to a single compare
                     vs the high word of the input.  */
              do_brcond_high:
--                bitmap_zero(temps_used.l, nb_temps);
+-                memset(&temps_used, 0, sizeof(temps_used));
-+                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                  op->opc = INDEX_op_brcond_i32;
                  op->args[0] = op->args[1];
                  op->args[1] = op->args[3];
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      goto do_default;
                  }
              do_brcond_low:
--                bitmap_zero(temps_used.l, nb_temps);
+-                memset(&temps_used, 0, sizeof(temps_used));
-+                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                  op->opc = INDEX_op_brcond_i32;
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                              op->args[5]);
              if (tmp != 2) {
              do_setcond_const:
--                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
              } else if ((op->args[5] == TCG_COND_LT
                          || op->args[5] == TCG_COND_GE)
                         && arg_is_const(op->args[3])
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-                block, otherwise we only trash the output args.  "mask" is
+             if (!(tcg_call_flags(op)
                    & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                  for (i = 0; i < nb_globals; i++) {
 -                    if (test_bit(i, temps_used.l)) {
 +                    if (test_bit(i, ctx.temps_used.l)) {
                          reset_ts(&s->temps[i]);
                      }
                  }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 block, otherwise we only trash the output args.  "z_mask" is
                 the non-zero bits mask for the first output arg.  */
              if (def->flags & TCG_OPF_BB_END) {
--                bitmap_zero(temps_used.l, nb_temps);
+-                memset(&temps_used, 0, sizeof(temps_used));
-+                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
              } else {
          do_reset_output:
                  for (i = 0; i < nb_oargs; i++) {
 --
 .25.1

-[PATCH 28/43] tcg: Remove movi and dupi opcodes
+[PULL 08/56] tcg/optimize: Remove do_default label
-These are now completely covered by mov from a
+Break the final cleanup clause out of the main switch
-TYPE_CONST temporary.
+statement.  When fully folding an opcode to mov/movi,
 use "continue" to process the next opcode, else break
 to fall into the final cleanup.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-opc.h        |  3 ---
+ tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
- tcg/optimize.c               |  4 ----
+file changed, 94 insertions(+), 96 deletions(-)
  tcg/tcg-op-vec.c             |  1 -
  tcg/tcg.c                    | 18 +-----------------
  tcg/aarch64/tcg-target.c.inc |  3 ---
  tcg/arm/tcg-target.c.inc     |  1 -
  tcg/i386/tcg-target.c.inc    |  3 ---
  tcg/mips/tcg-target.c.inc    |  2 --
  tcg/ppc/tcg-target.c.inc     |  3 ---
  tcg/riscv/tcg-target.c.inc   |  2 --
  tcg/s390/tcg-target.c.inc    |  2 --
  tcg/sparc/tcg-target.c.inc   |  2 --
  tcg/tci/tcg-target.c.inc     |  2 --
 files changed, 1 insertion(+), 45 deletions(-)
-diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-opc.h
-+++ b/include/tcg/tcg-opc.h
-@@ -XXX,XX +XXX,XX @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END)
- DEF(mb, 0, 0, 1, 0)
- DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
--DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
- DEF(setcond_i32, 1, 2, 1, 0)
- DEF(movcond_i32, 1, 4, 1, IMPL(TCG_TARGET_HAS_movcond_i32))
- /* load/store */
-@@ -XXX,XX +XXX,XX @@ DEF(ctz_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_ctz_i32))
- DEF(ctpop_i32, 1, 1, 0, IMPL(TCG_TARGET_HAS_ctpop_i32))
- DEF(mov_i64, 1, 1, 0, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
--DEF(movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
- DEF(setcond_i64, 1, 2, 1, IMPL64)
- DEF(movcond_i64, 1, 4, 1, IMPL64 | IMPL(TCG_TARGET_HAS_movcond_i64))
- /* load/store */
-@@ -XXX,XX +XXX,XX @@ DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
- #define IMPLVEC  TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec)
- DEF(mov_vec, 1, 1, 0, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
--DEF(dupi_vec, 1, 0, 1, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
- DEF(dup_vec, 1, 1, 0, IMPLVEC)
- DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32))
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         switch (opc) {
          CASE_OP_32_64_VEC(mov):
              tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-             break;
--        CASE_OP_32_64(movi):
--        case INDEX_op_dupi_vec:
--            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], op->args[1]);
 -            break;
++            continue;
          case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
-diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
+                 tmp = arg_info(op->args[1])->val;
-index XXXXXXX..XXXXXXX 100644
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
---- a/tcg/tcg-op-vec.c
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-+++ b/tcg/tcg-op-vec.c
+-                break;
-@@ -XXX,XX +XXX,XX @@ bool tcg_can_emit_vecop_list(const TCGOpcode *list,
++                continue;
-         case INDEX_op_xor_vec:
+             }
-         case INDEX_op_mov_vec:
+-            goto do_default;
-         case INDEX_op_dup_vec:
++            break;
--        case INDEX_op_dupi_vec:
          case INDEX_op_dup2_vec:
-         case INDEX_op_ld_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
-         case INDEX_op_st_vec:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-diff --git a/tcg/tcg.c b/tcg/tcg.c
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
-index XXXXXXX..XXXXXXX 100644
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
---- a/tcg/tcg.c
+                                            arg_info(op->args[2])->val));
-+++ b/tcg/tcg.c
+-                break;
-@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
++                continue;
-         return TCG_TARGET_HAS_goto_ptr;
+             } else if (args_are_copies(op->args[1], op->args[2])) {
+                 op->opc = INDEX_op_dup_vec;
-     case INDEX_op_mov_i32:
+                 TCGOP_VECE(op) = MO_32;
--    case INDEX_op_movi_i32:
+                 nb_iargs = 1;
-     case INDEX_op_setcond_i32:
+             }
-     case INDEX_op_brcond_i32:
+-            goto do_default;
-     case INDEX_op_ld8u_i32:
++            break;
-@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
-         return TCG_TARGET_REG_BITS == 32;
+         CASE_OP_32_64(not):
+         CASE_OP_32_64(neg):
-     case INDEX_op_mov_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    case INDEX_op_movi_i64:
+             if (arg_is_const(op->args[1])) {
-     case INDEX_op_setcond_i64:
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-     case INDEX_op_brcond_i64:
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-     case INDEX_op_ld8u_i64:
+-                break;
-@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
++                continue;
+             }
-     case INDEX_op_mov_vec:
+-            goto do_default;
-     case INDEX_op_dup_vec:
++            break;
--    case INDEX_op_dupi_vec:
-     case INDEX_op_dupm_vec:
+         CASE_OP_32_64(bswap16):
-     case INDEX_op_ld_vec:
+         CASE_OP_32_64(bswap32):
-     case INDEX_op_st_vec:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
- }
+                                           op->args[2]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
- /*
+-                break;
-- * Specialized code generation for INDEX_op_movi_*.
++                continue;
-+ * Specialized code generation for INDEX_op_mov_* with a constant.
+             }
-  */
+-            goto do_default;
- static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
++            break;
-                                   tcg_target_ulong val, TCGLifeData arg_life,
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
+         CASE_OP_32_64(add):
-     }
+         CASE_OP_32_64(sub):
- }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
--static void tcg_reg_alloc_movi(TCGContext *s, const TCGOp *op)
+                                           arg_info(op->args[2])->val);
--{
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
--    TCGTemp *ots = arg_temp(op->args[0]);
+-                break;
--    tcg_target_ulong val = op->args[1];
++                continue;
--
+             }
--    tcg_reg_alloc_do_movi(s, ots, val, op->life, op->output_pref[0]);
+-            goto do_default;
--}
++            break;
--
- /*
+         CASE_OP_32_64(clz):
-  * Specialized code generation for INDEX_op_mov_*.
+         CASE_OP_32_64(ctz):
-  */
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
+                 } else {
-         case INDEX_op_mov_vec:
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
-             tcg_reg_alloc_mov(s, op);
+                 }
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(deposit):
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(extract):
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(sextract):
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(extract2):
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                      ((uint32_t)v2 << (32 - shr)));
                  }
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(setcond):
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(brcond):
              tmp = do_constant_folding_cond(opc, op->args[0],
                                             op->args[1], op->args[2]);
 -            if (tmp != 2) {
 -                if (tmp) {
 -                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -                    op->opc = INDEX_op_br;
 -                    op->args[0] = op->args[3];
 -                } else {
 -                    tcg_op_remove(s, op);
 -                }
 +            switch (tmp) {
 +            case 0:
 +                tcg_op_remove(s, op);
 +                continue;
 +            case 1:
 +                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 +                op->opc = opc = INDEX_op_br;
 +                op->args[0] = op->args[3];
                  break;
              }
 -            goto do_default;
 +            break;
          CASE_OP_32_64(movcond):
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[5]);
              if (tmp != 2) {
                  tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
 -                break;
 +                continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
                  uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  if (fv == 1 && tv == 0) {
                      cond = tcg_invert_cond(cond);
                  } else if (!(tv == 1 && fv == 0)) {
 -                    goto do_default;
 +                    break;
                  }
                  op->args[3] = cond;
                  op->opc = opc = (opc == INDEX_op_movcond_i32
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                   : INDEX_op_setcond_i64);
                  nb_iargs = 2;
              }
 -            goto do_default;
 +            break;
          case INDEX_op_add2_i32:
          case INDEX_op_sub2_i32:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rh = op->args[1];
                  tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
                  tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          case INDEX_op_mulu2_i32:
              if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rh = op->args[1];
                  tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
                  tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
 -                break;
 +                continue;
              }
 -            goto do_default;
 +            break;
          case INDEX_op_brcond2_i32:
              tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
                                              op->args[4]);
 -            if (tmp != 2) {
 -                if (tmp) {
 -            do_brcond_true:
 -                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -                    op->opc = INDEX_op_br;
 -                    op->args[0] = op->args[5];
 -                } else {
 +            if (tmp == 0) {
              do_brcond_false:
 -                    tcg_op_remove(s, op);
 -                }
 -            } else if ((op->args[4] == TCG_COND_LT
 -                        || op->args[4] == TCG_COND_GE)
 -                       && arg_is_const(op->args[2])
 -                       && arg_info(op->args[2])->val == 0
 -                       && arg_is_const(op->args[3])
 -                       && arg_info(op->args[3])->val == 0) {
 +                tcg_op_remove(s, op);
 +                continue;
 +            }
 +            if (tmp == 1) {
 +            do_brcond_true:
 +                op->opc = opc = INDEX_op_br;
 +                op->args[0] = op->args[5];
 +                break;
 +            }
 +            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
 +                 && arg_is_const(op->args[2])
 +                 && arg_info(op->args[2])->val == 0
 +                 && arg_is_const(op->args[3])
 +                 && arg_info(op->args[3])->val == 0) {
                  /* Simplify LT/GE comparisons vs zero to a single compare
                     vs the high word of the input.  */
              do_brcond_high:
 -                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -                op->opc = INDEX_op_brcond_i32;
 +                op->opc = opc = INDEX_op_brcond_i32;
                  op->args[0] = op->args[1];
                  op->args[1] = op->args[3];
                  op->args[2] = op->args[4];
                  op->args[3] = op->args[5];
 -            } else if (op->args[4] == TCG_COND_EQ) {
 +                break;
 +            }
 +            if (op->args[4] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
                  tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  if (tmp == 0) {
                      goto do_brcond_false;
                  } else if (tmp != 1) {
 -                    goto do_default;
 +                    break;
                  }
              do_brcond_low:
                  memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
                  op->args[3] = op->args[5];
 -            } else if (op->args[4] == TCG_COND_NE) {
 +                break;
 +            }
 +            if (op->args[4] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
                  tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  } else if (tmp == 1) {
                      goto do_brcond_true;
                  }
 -                goto do_default;
 -            } else {
 -                goto do_default;
              }
              break;
--        case INDEX_op_movi_i32:
--        case INDEX_op_movi_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--        case INDEX_op_dupi_vec:
+             if (tmp != 2) {
--            tcg_reg_alloc_movi(s, op);
+             do_setcond_const:
                  tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 -            } else if ((op->args[5] == TCG_COND_LT
 -                        || op->args[5] == TCG_COND_GE)
 -                       && arg_is_const(op->args[3])
 -                       && arg_info(op->args[3])->val == 0
 -                       && arg_is_const(op->args[4])
 -                       && arg_info(op->args[4])->val == 0) {
 +                continue;
 +            }
 +            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 +                 && arg_is_const(op->args[3])
 +                 && arg_info(op->args[3])->val == 0
 +                 && arg_is_const(op->args[4])
 +                 && arg_info(op->args[4])->val == 0) {
                  /* Simplify LT/GE comparisons vs zero to a single compare
                     vs the high word of the input.  */
              do_setcond_high:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
                  op->args[3] = op->args[5];
 -            } else if (op->args[5] == TCG_COND_EQ) {
 +                break;
 +            }
 +            if (op->args[5] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
                  tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  if (tmp == 0) {
                      goto do_setcond_high;
                  } else if (tmp != 1) {
 -                    goto do_default;
 +                    break;
                  }
              do_setcond_low:
                  reset_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  op->opc = INDEX_op_setcond_i32;
                  op->args[2] = op->args[3];
                  op->args[3] = op->args[5];
 -            } else if (op->args[5] == TCG_COND_NE) {
 +                break;
 +            }
 +            if (op->args[5] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
                  tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  } else if (tmp == 1) {
                      goto do_setcond_const;
                  }
 -                goto do_default;
 -            } else {
 -                goto do_default;
              }
              break;
 -        case INDEX_op_call:
 -            if (!(tcg_call_flags(op)
 +        default:
 +            break;
 +        }
 +
 +        /* Some of the folding above can change opc. */
 +        opc = op->opc;
 +        def = &tcg_op_defs[opc];
 +        if (def->flags & TCG_OPF_BB_END) {
 +            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 +        } else {
 +            if (opc == INDEX_op_call &&
 +                !(tcg_call_flags(op)
                    & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                  for (i = 0; i < nb_globals; i++) {
                      if (test_bit(i, ctx.temps_used.l)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      }
                  }
              }
 -            goto do_reset_output;
 -        default:
 -        do_default:
 -            /* Default case: we know nothing about operation (or were unable
 -               to compute the operation result) so no propagation is done.
 -               We trash everything if the operation is the end of a basic
 -               block, otherwise we only trash the output args.  "z_mask" is
 -               the non-zero bits mask for the first output arg.  */
 -            if (def->flags & TCG_OPF_BB_END) {
 -                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -            } else {
 -        do_reset_output:
 -                for (i = 0; i < nb_oargs; i++) {
 -                    reset_temp(op->args[i]);
 -                    /* Save the corresponding known-zero bits mask for the
 -                       first output argument (only one supported so far). */
 -                    if (i == 0) {
 -                        arg_info(op->args[i])->z_mask = z_mask;
 -                    }
 +            for (i = 0; i < nb_oargs; i++) {
 +                reset_temp(op->args[i]);
 +                /* Save the corresponding known-zero bits mask for the
 +                   first output argument (only one supported so far). */
 +                if (i == 0) {
 +                    arg_info(op->args[i])->z_mask = z_mask;
                  }
              }
 -            break;
-         case INDEX_op_dup_vec:
+         }
-             tcg_reg_alloc_dup(s, op);
-             break;
+         /* Eliminate duplicate and redundant fence instructions.  */
 diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/aarch64/tcg-target.c.inc
 +++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
 -    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
 -    case INDEX_op_movi_i64:
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
      default:
          g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
          break;
      case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
 -    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
      case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
      default:
          g_assert_not_reached();
 diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.c.inc
 +++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
          break;
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
 -    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
      default:
          tcg_abort();
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
          break;
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
 -    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
 -    case INDEX_op_movi_i64:
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
      default:
          tcg_abort();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
          break;
      case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
 -    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
      case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
      default:
          g_assert_not_reached();
 diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.c.inc
 +++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
          break;
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
 -    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
 -    case INDEX_op_movi_i64:
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
      default:
          tcg_abort();
 diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/ppc/tcg-target.c.inc
 +++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
      case INDEX_op_mov_i32:   /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
 -    case INDEX_op_movi_i32:  /* Always emitted via tcg_out_movi.  */
 -    case INDEX_op_movi_i64:
      case INDEX_op_call:      /* Always emitted via tcg_out_call.  */
      default:
          tcg_abort();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
          return;
      case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
 -    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
      case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
      default:
          g_assert_not_reached();
 diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/riscv/tcg-target.c.inc
 +++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
 -    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
 -    case INDEX_op_movi_i64:
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
      default:
          g_assert_not_reached();
 diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/s390/tcg-target.c.inc
 +++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
 -    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
 -    case INDEX_op_movi_i64:
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
      default:
          tcg_abort();
 diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/sparc/tcg-target.c.inc
 +++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
 -    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
 -    case INDEX_op_movi_i64:
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
      default:
          tcg_abort();
 diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.c.inc
 +++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          break;
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
 -    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
 -    case INDEX_op_movi_i64:
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
      default:
          tcg_abort();
 --
 .25.1

-[PATCH 12/43] tcg: Increase tcg_out_dupi_vec immediate to int64_t
+[PULL 09/56] tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
-While we don't store more than tcg_target_long in TCGTemp,
+Adjust the interface to take the OptContext parameter instead
-we shouldn't be limited to that for code generation.  We will
+of TCGContext or both.
-be able to use this for INDEX_op_dup2_vec with 2 constants.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Also pass along the minimal vece that may be said to apply
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 to the constant.  This allows some simplification in the
 various backends.
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg.c                    | 31 +++++++++++++++++++++++++-----
+ tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
- tcg/aarch64/tcg-target.c.inc | 12 ++++++------
+file changed, 34 insertions(+), 33 deletions(-)
- tcg/i386/tcg-target.c.inc    | 22 ++++++++++++---------
- tcg/ppc/tcg-target.c.inc     | 37 +++++++++++++++++++++++-------------
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 files changed, 69 insertions(+), 33 deletions(-)
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
+--- a/tcg/optimize.c
-+++ b/tcg/tcg.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-                             TCGReg dst, TCGReg src);
+ } TempOptInfo;
- static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
-                              TCGReg dst, TCGReg base, intptr_t offset);
+ typedef struct OptContext {
--static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
++    TCGContext *tcg;
--                             TCGReg dst, tcg_target_long arg);
+     TCGTempSet temps_used;
-+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+ } OptContext;
-+                             TCGReg dst, int64_t arg);
- static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
+@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
-                            unsigned vece, const TCGArg *args,
+     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
-                            const int *const_args);
+ }
-@@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 -static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 +static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
-     g_assert_not_reached();
+     TCGTemp *dst_ts = arg_temp(dst);
- }
+     TCGTemp *src_ts = arg_temp(src);
--static inline void tcg_out_dupi_vec(TCGContext *s, TCGType type,
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
--                                    TCGReg dst, tcg_target_long arg)
+     TCGOpcode new_op;
-+static inline void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
-+                                    TCGReg dst, int64_t arg)
+     if (ts_are_copies(dst_ts, src_ts)) {
- {
+-        tcg_op_remove(s, op);
-     g_assert_not_reached();
++        tcg_op_remove(ctx->tcg, op);
  }
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
          if (ts->type <= TCG_TYPE_I64) {
              tcg_out_movi(s, ts->type, reg, ts->val);
          } else {
 -            tcg_out_dupi_vec(s, ts->type, reg, ts->val);
 +            uint64_t val = ts->val;
 +            MemOp vece = MO_64;
 +
 +            /*
 +             * Find the minimal vector element that matches the constant.
 +             * The targets will, in general, have to do this search anyway,
 +             * so do this generically.
 +             */
 +            if (TCG_TARGET_REG_BITS == 32) {
 +                val = dup_const(MO_32, val);
 +                vece = MO_32;
 +            }
 +            if (val == dup_const(MO_8, val)) {
 +                vece = MO_8;
 +            } else if (val == dup_const(MO_16, val)) {
 +                vece = MO_16;
 +            } else if (TCG_TARGET_REG_BITS == 64 &&
 +                       val == dup_const(MO_32, val)) {
 +                vece = MO_32;
 +            }
 +
 +            tcg_out_dupi_vec(s, ts->type, vece, reg, ts->val);
          }
          ts->mem_coherent = 0;
          break;
 diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/aarch64/tcg-target.c.inc
 +++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
      tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c);
  }
 -static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
 -                             TCGReg rd, tcg_target_long v64)
 +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 +                             TCGReg rd, int64_t v64)
  {
      bool q = type == TCG_TYPE_V128;
      int cmode, imm8, i;
      /* Test all bytes equal first.  */
 -    if (v64 == dup_const(MO_8, v64)) {
 +    if (vece == MO_8) {
          imm8 = (uint8_t)v64;
          tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0xe, imm8);
          return;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
       * cannot find an expansion there's no point checking a larger
       * width because we already know by replication it cannot match.
       */
 -    if (v64 == dup_const(MO_16, v64)) {
 +    if (vece == MO_16) {
          uint16_t v16 = v64;
          if (is_shimm16(v16, &cmode, &imm8)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
          tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0x8, v16 & 0xff);
          tcg_out_insn(s, 3606, ORR, q, rd, 0, 0xa, v16 >> 8);
          return;
 -    } else if (v64 == dup_const(MO_32, v64)) {
 +    } else if (vece == MO_32) {
          uint32_t v32 = v64;
          uint32_t n32 = ~v32;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                          tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
                          break;
                      }
 -                    tcg_out_dupi_vec(s, type, TCG_VEC_TMP, 0);
 +                    tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
                      a2 = TCG_VEC_TMP;
                  }
                  insn = cmp_insn[cond];
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
      return true;
  }
 -static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
 -                             TCGReg ret, tcg_target_long arg)
 +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 +                             TCGReg ret, int64_t arg)
  {
      int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
          return;
      }
--    if (TCG_TARGET_REG_BITS == 64) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 +    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 +        if (have_avx2) {
 +            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 +        } else {
 +            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 +        }
 +        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 +    } else {
          if (type == TCG_TYPE_V64) {
              tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
          } else if (have_avx2) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
          } else {
              tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
          }
 -        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 -    } else {
 -        if (have_avx2) {
 -            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 +        if (TCG_TARGET_REG_BITS == 64) {
 +            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
          } else {
 -            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 +            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
          }
 -        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
      }
  }
-diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
+-static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
-index XXXXXXX..XXXXXXX 100644
+-                             TCGOp *op, TCGArg dst, uint64_t val)
---- a/tcg/ppc/tcg-target.c.inc
++static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-+++ b/tcg/ppc/tcg-target.c.inc
++                             TCGArg dst, uint64_t val)
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
+ {
-     }
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
      TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
      /* Convert movi to mov with constant temp. */
      tv = tcg_constant_internal(type, val);
      init_ts_info(ctx, tv);
 -    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 +    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
--static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
+ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
--                             tcg_target_long val)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 +                             TCGReg ret, int64_t val)
  {
-     uint32_t load_insn;
+     int nb_temps, nb_globals, i;
-     int rel, low;
+     TCGOp *op, *op_next, *prev_mb = NULL;
-     intptr_t add;
+-    OptContext ctx = {};
++    OptContext ctx = { .tcg = s };
--    low = (int8_t)val;
--    if (low >= -16 && low < 16) {
+     /* Array VALS has an element for each temp.
--        if (val == (tcg_target_long)dup_const(MO_8, low)) {
+        If this temp holds a constant then its value is kept in VALS' element.
-+    switch (vece) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+    case MO_8:
+         CASE_OP_32_64(rotr):
-+        low = (int8_t)val;
+             if (arg_is_const(op->args[1])
-+        if (low >= -16 && low < 16) {
+                 && arg_info(op->args[1])->val == 0) {
-             tcg_out32(s, VSPLTISB | VRT(ret) | ((val & 31) << 16));
+-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
-             return;
++                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == -1) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (partmask == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
              continue;
          }
--        if (val == (tcg_target_long)dup_const(MO_16, low)) {
+         if (affected == 0) {
-+        if (have_isa_3_00) {
+             tcg_debug_assert(nb_oargs == 1);
-+            tcg_out32(s, XXSPLTIB | VRT(ret) | ((val & 0xff) << 11));
+-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-+            return;
++            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-+        }
+             continue;
 +        break;
 +
 +    case MO_16:
 +        low = (int16_t)val;
 +        if (low >= -16 && low < 16) {
              tcg_out32(s, VSPLTISH | VRT(ret) | ((val & 31) << 16));
              return;
          }
--        if (val == (tcg_target_long)dup_const(MO_32, low)) {
-+        break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+
+         CASE_OP_32_64(mulsh):
-+    case MO_32:
+             if (arg_is_const(op->args[2])
-+        low = (int32_t)val;
+                 && arg_info(op->args[2])->val == 0) {
-+        if (low >= -16 && low < 16) {
+-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
-             tcg_out32(s, VSPLTISW | VRT(ret) | ((val & 31) << 16));
++                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-             return;
+                 continue;
-         }
+             }
--    }
+             break;
--    if (have_isa_3_00 && val == (tcg_target_long)dup_const(MO_8, val)) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--        tcg_out32(s, XXSPLTIB | VRT(ret) | ((val & 0xff) << 11));
+         CASE_OP_32_64_VEC(or):
--        return;
+         CASE_OP_32_64_VEC(and):
-+        break;
+             if (args_are_copies(op->args[1], op->args[2])) {
-     }
+-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
++                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-     /*
+                 continue;
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
+             }
-         if (TCG_TARGET_REG_BITS == 64) {
+             break;
-             new_pool_label(s, val, rel, s->code_ptr, add);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-         } else {
+         CASE_OP_32_64_VEC(sub):
--            new_pool_l2(s, rel, s->code_ptr, add, val, val);
+         CASE_OP_32_64_VEC(xor):
-+            new_pool_l2(s, rel, s->code_ptr, add, val >> 32, val);
+             if (args_are_copies(op->args[1], op->args[2])) {
-         }
+-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
-     } else {
++                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-         load_insn = LVX | VRT(ret) | RB(TCG_REG_TMP1);
+                 continue;
-         if (TCG_TARGET_REG_BITS == 64) {
+             }
-             new_pool_l2(s, rel, s->code_ptr, add, val, val);
+             break;
-         } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            new_pool_l4(s, rel, s->code_ptr, add, val, val, val, val);
+            allocator where needed and possible.  Also detect copies. */
-+            new_pool_l4(s, rel, s->code_ptr, add,
+         switch (opc) {
-+                        val >> 32, val, val >> 32, val);
+         CASE_OP_32_64_VEC(mov):
-         }
+-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-     }
++            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+             continue;
          case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
 +                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                   deposit64(arg_info(op->args[1])->val, 32, 32,
                                             arg_info(op->args[2])->val));
                  continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGArg v = arg_info(op->args[1])->val;
                  if (v != 0) {
                      tmp = do_constant_folding(opc, v, 0);
 -                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  } else {
 -                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
 +                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                  }
                  continue;
              }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = deposit64(arg_info(op->args[1])->val,
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                      ((uint32_t)v2 << (32 - shr)));
                  }
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[5]);
              if (tmp != 2) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                              op->args[5]);
              if (tmp != 2) {
              do_setcond_const:
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 --
 .25.1

-New patch
+[PULL 10/56] tcg/optimize: Move prev_mb into OptContext
+This will expose the variable to subroutines that
+will be broken out of tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 11 ++++++-----
+file changed, 6 insertions(+), 5 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+ typedef struct OptContext {
+     TCGContext *tcg;
++    TCGOp *prev_mb;
+     TCGTempSet temps_used;
+ } OptContext;
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
+ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+-    TCGOp *op, *op_next, *prev_mb = NULL;
++    TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+     /* Array VALS has an element for each temp.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+-        if (prev_mb) {
++        if (ctx.prev_mb) {
+             switch (opc) {
+             case INDEX_op_mb:
+                 /* Merge two barriers of the same type into one,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                  * barrier.  This is stricter than specified but for
+                  * the purposes of TCG is better than not optimizing.
+                  */
+-                prev_mb->args[0] |= op->args[0];
++                ctx.prev_mb->args[0] |= op->args[0];
+                 tcg_op_remove(s, op);
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i64:
+             case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+-                prev_mb = NULL;
++                ctx.prev_mb = NULL;
+                 break;
+             }
+         } else if (opc == INDEX_op_mb) {
+-            prev_mb = op;
++            ctx.prev_mb = op;
+         }
+     }
+ }
+--
+.25.1

-[PATCH 20/43] tcg/optimize: Adjust TempOptInfo allocation
+[PULL 11/56] tcg/optimize: Split out init_arguments
-Do not allocate a large block for indexing.  Instead, allocate
+There was no real reason for calls to have separate code here.
-for each temporary as they are seen.
+Unify init for calls vs non-calls using the call path, which
+handles TCG_CALL_DUMMY_ARG.
 In general, this will use less memory, if we consider that most
 TBs do not touch every target register.  This also allows us to
 allocate TempOptInfo for new temps created during optimization.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/optimize.c | 60 ++++++++++++++++++++++++++++----------------------
+ tcg/optimize.c | 25 +++++++++++--------------
-file changed, 34 insertions(+), 26 deletions(-)
+file changed, 11 insertions(+), 14 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
  }
  /* Initialize and activate a temporary.  */
 -static void init_ts_info(TempOptInfo *infos,
 -                         TCGTempSet *temps_used, TCGTemp *ts)
 +static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
  {
      size_t idx = temp_idx(ts);
 -    if (!test_bit(idx, temps_used->l)) {
 -        TempOptInfo *ti = &infos[idx];
 +    TempOptInfo *ti;
 +    if (test_bit(idx, temps_used->l)) {
 +        return;
 +    }
 +    set_bit(idx, temps_used->l);
 +
 +    ti = ts->state_ptr;
 +    if (ti == NULL) {
 +        ti = tcg_malloc(sizeof(TempOptInfo));
          ts->state_ptr = ti;
 -        ti->next_copy = ts;
 -        ti->prev_copy = ts;
 -        if (ts->kind == TEMP_CONST) {
 -            ti->is_const = true;
 -            ti->val = ti->mask = ts->val;
 -            if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
 -                /* High bits of a 32-bit quantity are garbage.  */
 -                ti->mask |= ~0xffffffffull;
 -            }
 -        } else {
 -            ti->is_const = false;
 -            ti->mask = -1;
 +    }
 +
 +    ti->next_copy = ts;
 +    ti->prev_copy = ts;
 +    if (ts->kind == TEMP_CONST) {
 +        ti->is_const = true;
 +        ti->val = ts->val;
 +        ti->mask = ts->val;
 +        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
 +            /* High bits of a 32-bit quantity are garbage.  */
 +            ti->mask |= ~0xffffffffull;
          }
 -        set_bit(idx, temps_used->l);
 +    } else {
 +        ti->is_const = false;
 +        ti->mask = -1;
      }
  }
--static void init_arg_info(TempOptInfo *infos,
+-static void init_arg_info(OptContext *ctx, TCGArg arg)
--                          TCGTempSet *temps_used, TCGArg arg)
+-{
-+static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
+-    init_ts_info(ctx, arg_temp(arg));
 -}
 -
  static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
  {
--    init_ts_info(infos, temps_used, arg_temp(arg));
+     TCGTemp *i, *g, *l;
-+    init_ts_info(temps_used, arg_temp(arg));
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
      return false;
  }
- static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
++static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
-@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
++{
 +    for (int i = 0; i < nb_args; i++) {
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        if (ts) {
 +            init_ts_info(ctx, ts);
 +        }
 +    }
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
--    int nb_temps, nb_globals;
-+    int nb_temps, nb_globals, i;
-     TCGOp *op, *op_next, *prev_mb = NULL;
--    TempOptInfo *infos;
-     TCGTempSet temps_used;
-     /* Array VALS has an element for each temp.
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (opc == INDEX_op_call) {
-     nb_temps = s->nb_temps;
+             nb_oargs = TCGOP_CALLO(op);
-     nb_globals = s->nb_globals;
+             nb_iargs = TCGOP_CALLI(op);
-+
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-     bitmap_zero(temps_used.l, nb_temps);
+-                TCGTemp *ts = arg_temp(op->args[i]);
--    infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);
+-                if (ts) {
-+    for (i = 0; i < nb_temps; ++i) {
+-                    init_ts_info(&ctx, ts);
-+        s->temps[i].state_ptr = NULL;
+-                }
-+    }
+-            }
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
          uint64_t mask, partmask, affected, tmp;
 -        int nb_oargs, nb_iargs, i;
 +        int nb_oargs, nb_iargs;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def = &tcg_op_defs[opc];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              for (i = 0; i < nb_oargs + nb_iargs; i++) {
                  TCGTemp *ts = arg_temp(op->args[i]);
                  if (ts) {
 -                    init_ts_info(infos, &temps_used, ts);
 +                    init_ts_info(&temps_used, ts);
                  }
              }
          } else {
              nb_oargs = def->nb_oargs;
              nb_iargs = def->nb_iargs;
-             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
--                init_arg_info(infos, &temps_used, op->args[i]);
+-                init_arg_info(&ctx, op->args[i]);
-+                init_arg_info(&temps_used, op->args[i]);
+-            }
              }
          }
++        init_arguments(&ctx, op, nb_oargs + nb_iargs);
          /* Do copy propagation */
          for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 --
 .25.1

-[PATCH 42/43] tcg/sparc: Convert to tcg-constr.c.inc
+[PULL 12/56] tcg/optimize: Split out copy_propagate
+Continue splitting tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/sparc/tcg-target-constr.h | 27 +++++++++++++
+ tcg/optimize.c | 22 ++++++++++++++--------
- tcg/sparc/tcg-target.c.inc    | 74 ++++++++++++-----------------------
+file changed, 14 insertions(+), 8 deletions(-)
 files changed, 51 insertions(+), 50 deletions(-)
  create mode 100644 tcg/sparc/tcg-target-constr.h
-diff --git a/tcg/sparc/tcg-target-constr.h b/tcg/sparc/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/sparc/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * Sparc target-specific operand constraints.
 + * Copyright (c) 2020 Linaro
 + */
 +
 +C_O0_I1(r)
 +C_O0_I2(rZ, r)
 +C_O0_I2(RZ, r)
 +C_O0_I2(rZ, rJ)
 +C_O0_I2(RZ, RJ)
 +C_O0_I2(sZ, A)
 +C_O0_I2(SZ, A)
 +C_O1_I1(r, A)
 +C_O1_I1(R, A)
 +C_O1_I1(r, r)
 +C_O1_I1(r, R)
 +C_O1_I1(R, r)
 +C_O1_I1(R, R)
 +C_O1_I2(R, R, R)
 +C_O1_I2(r, rZ, rJ)
 +C_O1_I2(R, RZ, RJ)
 +C_O1_I4(r, rZ, rJ, rI, 0)
 +C_O1_I4(R, RZ, RJ, RI, 0)
 +C_O2_I2(r, r, rZ, rJ)
 +C_O2_I4(R, R, RZ, RZ, RJ, RI)
 +C_O2_I4(r, r, rZ, rZ, rJ, rJ)
 diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/sparc/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/sparc/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
      }
  }
-+/* Define all constraint sets. */
++static void copy_propagate(OptContext *ctx, TCGOp *op,
-+#include "../tcg-constr.c.inc"
++                           int nb_oargs, int nb_iargs)
 +{
 +    TCGContext *s = ctx->tcg;
 +
- static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        if (ts && ts_is_copy(ts)) {
 +            op->args[i] = temp_arg(find_better_copy(s, ts));
 +        }
 +    }
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
--    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
+             nb_iargs = def->nb_iargs;
--    static const TCGTargetOpDef R_r = { .args_ct_str = { "R", "r" } };
+         }
--    static const TCGTargetOpDef r_R = { .args_ct_str = { "r", "R" } };
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
 -    static const TCGTargetOpDef R_R = { .args_ct_str = { "R", "R" } };
 -    static const TCGTargetOpDef r_A = { .args_ct_str = { "r", "A" } };
 -    static const TCGTargetOpDef R_A = { .args_ct_str = { "R", "A" } };
 -    static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
 -    static const TCGTargetOpDef RZ_r = { .args_ct_str = { "RZ", "r" } };
 -    static const TCGTargetOpDef sZ_A = { .args_ct_str = { "sZ", "A" } };
 -    static const TCGTargetOpDef SZ_A = { .args_ct_str = { "SZ", "A" } };
 -    static const TCGTargetOpDef rZ_rJ = { .args_ct_str = { "rZ", "rJ" } };
 -    static const TCGTargetOpDef RZ_RJ = { .args_ct_str = { "RZ", "RJ" } };
 -    static const TCGTargetOpDef R_R_R = { .args_ct_str = { "R", "R", "R" } };
 -    static const TCGTargetOpDef r_rZ_rJ
 -        = { .args_ct_str = { "r", "rZ", "rJ" } };
 -    static const TCGTargetOpDef R_RZ_RJ
 -        = { .args_ct_str = { "R", "RZ", "RJ" } };
 -    static const TCGTargetOpDef r_r_rZ_rJ
 -        = { .args_ct_str = { "r", "r", "rZ", "rJ" } };
 -    static const TCGTargetOpDef movc_32
 -        = { .args_ct_str = { "r", "rZ", "rJ", "rI", "0" } };
 -    static const TCGTargetOpDef movc_64
 -        = { .args_ct_str = { "R", "RZ", "RJ", "RI", "0" } };
 -    static const TCGTargetOpDef add2_32
 -        = { .args_ct_str = { "r", "r", "rZ", "rZ", "rJ", "rJ" } };
 -    static const TCGTargetOpDef add2_64
 -        = { .args_ct_str = { "R", "R", "RZ", "RZ", "RJ", "RI" } };
 -
-     switch (op) {
+-        /* Do copy propagation */
-     case INDEX_op_goto_ptr:
+-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
--        return &r;
+-            TCGTemp *ts = arg_temp(op->args[i]);
-+        return C_O0_I1(r);
+-            if (ts && ts_is_copy(ts)) {
+-                op->args[i] = temp_arg(find_better_copy(s, ts));
-     case INDEX_op_ld8u_i32:
+-            }
-     case INDEX_op_ld8s_i32:
+-        }
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
-     case INDEX_op_ld_i32:
-     case INDEX_op_neg_i32:
+         /* For commutative operations make constant second argument */
-     case INDEX_op_not_i32:
+         switch (opc) {
 -        return &r_r;
 +        return C_O1_I1(r, r);
      case INDEX_op_st8_i32:
      case INDEX_op_st16_i32:
      case INDEX_op_st_i32:
 -        return &rZ_r;
 +        return C_O0_I2(rZ, r);
      case INDEX_op_add_i32:
      case INDEX_op_mul_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_shr_i32:
      case INDEX_op_sar_i32:
      case INDEX_op_setcond_i32:
 -        return &r_rZ_rJ;
 +        return C_O1_I2(r, rZ, rJ);
      case INDEX_op_brcond_i32:
 -        return &rZ_rJ;
 +        return C_O0_I2(rZ, rJ);
      case INDEX_op_movcond_i32:
 -        return &movc_32;
 +        return C_O1_I4(r, rZ, rJ, rI, 0);
      case INDEX_op_add2_i32:
      case INDEX_op_sub2_i32:
 -        return &add2_32;
 +        return C_O2_I4(r, r, rZ, rZ, rJ, rJ);
      case INDEX_op_mulu2_i32:
      case INDEX_op_muls2_i32:
 -        return &r_r_rZ_rJ;
 +        return C_O2_I2(r, r, rZ, rJ);
      case INDEX_op_ld8u_i64:
      case INDEX_op_ld8s_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_ld_i64:
      case INDEX_op_ext_i32_i64:
      case INDEX_op_extu_i32_i64:
 -        return &R_r;
 +        return C_O1_I1(R, r);
      case INDEX_op_st8_i64:
      case INDEX_op_st16_i64:
      case INDEX_op_st32_i64:
      case INDEX_op_st_i64:
 -        return &RZ_r;
 +        return C_O0_I2(RZ, r);
      case INDEX_op_add_i64:
      case INDEX_op_mul_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_shr_i64:
      case INDEX_op_sar_i64:
      case INDEX_op_setcond_i64:
 -        return &R_RZ_RJ;
 +        return C_O1_I2(R, RZ, RJ);
      case INDEX_op_neg_i64:
      case INDEX_op_not_i64:
      case INDEX_op_ext32s_i64:
      case INDEX_op_ext32u_i64:
 -        return &R_R;
 +        return C_O1_I1(R, R);
      case INDEX_op_extrl_i64_i32:
      case INDEX_op_extrh_i64_i32:
 -        return &r_R;
 +        return C_O1_I1(r, R);
      case INDEX_op_brcond_i64:
 -        return &RZ_RJ;
 +        return C_O0_I2(RZ, RJ);
      case INDEX_op_movcond_i64:
 -        return &movc_64;
 +        return C_O1_I4(R, RZ, RJ, RI, 0);
      case INDEX_op_add2_i64:
      case INDEX_op_sub2_i64:
 -        return &add2_64;
 +        return C_O2_I4(R, R, RZ, RZ, RJ, RI);
      case INDEX_op_muluh_i64:
 -        return &R_R_R;
 +        return C_O1_I2(R, R, R);
      case INDEX_op_qemu_ld_i32:
 -        return &r_A;
 +        return C_O1_I1(r, A);
      case INDEX_op_qemu_ld_i64:
 -        return &R_A;
 +        return C_O1_I1(R, A);
      case INDEX_op_qemu_st_i32:
 -        return &sZ_A;
 +        return C_O0_I2(sZ, A);
      case INDEX_op_qemu_st_i64:
 -        return &SZ_A;
 +        return C_O0_I2(SZ, A);
      default:
          return NULL;
 --
 .25.1

-[PATCH 41/43] tcg/s390: Convert to tcg-constr.c.inc
+[PULL 13/56] tcg/optimize: Split out fold_call
+Calls are special in that they have a variable number
+of arguments, and need to be able to clobber globals.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/s390/tcg-target-constr.h |  24 +++++++
+ tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
- tcg/s390/tcg-target.c.inc    | 119 +++++++++++++++--------------------
+file changed, 41 insertions(+), 22 deletions(-)
 files changed, 76 insertions(+), 67 deletions(-)
  create mode 100644 tcg/s390/tcg-target-constr.h
-diff --git a/tcg/s390/tcg-target-constr.h b/tcg/s390/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/s390/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * S390 target-specific operand constraints.
 + * Copyright (c) 2020 Linaro
 + */
 +
 +C_O0_I1(r)
 +C_O0_I2(L, L)
 +C_O0_I2(r, r)
 +C_O0_I2(r, ri)
 +C_O1_I1(r, L)
 +C_O1_I1(r, r)
 +C_O1_I2(r, 0, ri)
 +C_O1_I2(r, 0, rI)
 +C_O1_I2(r, 0, rJ)
 +C_O1_I2(r, r, ri)
 +C_O1_I2(r, rZ, r)
 +C_O1_I4(r, r, ri, r, 0)
 +C_O1_I4(r, r, ri, rI, 0)
 +C_O2_I2(b, a, 0, r)
 +C_O2_I3(b, a, 0, 1, r)
 +C_O2_I4(r, r, 0, 1, rA, r)
 +C_O2_I4(r, r, 0, 1, ri, r)
 +C_O2_I4(r, r, 0, 1, r, r)
 diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/s390/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/s390/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
      }
  }
-+/* Define all constraint sets. */
++static bool fold_call(OptContext *ctx, TCGOp *op)
-+#include "../tcg-constr.c.inc"
++{
 +    TCGContext *s = ctx->tcg;
 +    int nb_oargs = TCGOP_CALLO(op);
 +    int nb_iargs = TCGOP_CALLI(op);
 +    int flags, i;
 +
- static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++    init_arguments(ctx, op, nb_oargs + nb_iargs);
 +    copy_propagate(ctx, op, nb_oargs, nb_iargs);
 +
 +    /* If the function reads or writes globals, reset temp data. */
 +    flags = tcg_call_flags(op);
 +    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
 +        int nb_globals = s->nb_globals;
 +
 +        for (i = 0; i < nb_globals; i++) {
 +            if (test_bit(i, ctx->temps_used.l)) {
 +                reset_ts(&ctx->tcg->temps[i]);
 +            }
 +        }
 +    }
 +
 +    /* Reset temp data for outputs. */
 +    for (i = 0; i < nb_oargs; i++) {
 +        reset_temp(op->args[i]);
 +    }
 +
 +    /* Stop optimizing MB across calls. */
 +    ctx->prev_mb = NULL;
 +    return true;
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
--    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
+-    int nb_temps, nb_globals, i;
--    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
++    int nb_temps, i;
--    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
+     TCGOp *op, *op_next;
--    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
+     OptContext ctx = { .tcg = s };
--    static const TCGTargetOpDef r_ri = { .args_ct_str = { "r", "ri" } };
--    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    static const TCGTargetOpDef r_0_ri = { .args_ct_str = { "r", "0", "ri" } };
+        available through the doubly linked circular list. */
--    static const TCGTargetOpDef r_0_rI = { .args_ct_str = { "r", "0", "rI" } };
--    static const TCGTargetOpDef r_0_rJ = { .args_ct_str = { "r", "0", "rJ" } };
+     nb_temps = s->nb_temps;
--    static const TCGTargetOpDef a2_r
+-    nb_globals = s->nb_globals;
 -        = { .args_ct_str = { "r", "r", "0", "1", "r", "r" } };
 -    static const TCGTargetOpDef a2_ri
 -        = { .args_ct_str = { "r", "r", "0", "1", "ri", "r" } };
 -    static const TCGTargetOpDef a2_rA
 -        = { .args_ct_str = { "r", "r", "0", "1", "rA", "r" } };
 -
-     switch (op) {
+     for (i = 0; i < nb_temps; ++i) {
-     case INDEX_op_goto_ptr:
+         s->temps[i].state_ptr = NULL;
--        return &r;
+     }
-+        return C_O0_I1(r);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
-     case INDEX_op_ld8u_i32:
+         int nb_oargs, nb_iargs;
-     case INDEX_op_ld8u_i64:
+         TCGOpcode opc = op->opc;
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
+-        const TCGOpDef *def = &tcg_op_defs[opc];
-     case INDEX_op_ld32u_i64:
++        const TCGOpDef *def;
-     case INDEX_op_ld32s_i64:
-     case INDEX_op_ld_i64:
+-        /* Count the arguments, and initialize the temps that are
-+        return C_O1_I1(r, r);
+-           going to be used */
 +        /* Calls are special. */
          if (opc == INDEX_op_call) {
 -            nb_oargs = TCGOP_CALLO(op);
 -            nb_iargs = TCGOP_CALLI(op);
 -        } else {
 -            nb_oargs = def->nb_oargs;
 -            nb_iargs = def->nb_iargs;
 +            fold_call(&ctx, op);
 +            continue;
          }
 +
-     case INDEX_op_st8_i32:
++        def = &tcg_op_defs[opc];
-     case INDEX_op_st8_i64:
++        nb_oargs = def->nb_oargs;
-     case INDEX_op_st16_i32:
++        nb_iargs = def->nb_iargs;
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
-     case INDEX_op_st_i32:
+         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
-     case INDEX_op_st32_i64:
-     case INDEX_op_st_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--        return &r_r;
+         if (def->flags & TCG_OPF_BB_END) {
-+        return C_O0_I2(r, r);
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+         } else {
-     case INDEX_op_add_i32:
+-            if (opc == INDEX_op_call &&
-     case INDEX_op_add_i64:
+-                !(tcg_call_flags(op)
--        return &r_r_ri;
+-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
-+    case INDEX_op_shl_i64:
+-                for (i = 0; i < nb_globals; i++) {
-+    case INDEX_op_shr_i64:
+-                    if (test_bit(i, ctx.temps_used.l)) {
-+    case INDEX_op_sar_i64:
+-                        reset_ts(&s->temps[i]);
-+    case INDEX_op_rotl_i32:
+-                    }
-+    case INDEX_op_rotl_i64:
+-                }
-+    case INDEX_op_rotr_i32:
+-            }
 +    case INDEX_op_rotr_i64:
 +    case INDEX_op_clz_i64:
 +    case INDEX_op_setcond_i32:
 +    case INDEX_op_setcond_i64:
 +        return C_O1_I2(r, r, ri);
 +
      case INDEX_op_sub_i32:
      case INDEX_op_sub_i64:
      case INDEX_op_and_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_or_i64:
      case INDEX_op_xor_i32:
      case INDEX_op_xor_i64:
 -        return (s390_facilities & FACILITY_DISTINCT_OPS ? &r_r_ri : &r_0_ri);
 +        return (s390_facilities & FACILITY_DISTINCT_OPS
 +                ? C_O1_I2(r, r, ri)
 +                : C_O1_I2(r, 0, ri));
      case INDEX_op_mul_i32:
          /* If we have the general-instruction-extensions, then we have
             MULTIPLY SINGLE IMMEDIATE with a signed 32-bit, otherwise we
             have only MULTIPLY HALFWORD IMMEDIATE, with a signed 16-bit.  */
 -        return (s390_facilities & FACILITY_GEN_INST_EXT ? &r_0_ri : &r_0_rI);
 +        return (s390_facilities & FACILITY_GEN_INST_EXT
 +                ? C_O1_I2(r, 0, ri)
 +                : C_O1_I2(r, 0, rI));
 +
      case INDEX_op_mul_i64:
 -        return (s390_facilities & FACILITY_GEN_INST_EXT ? &r_0_rJ : &r_0_rI);
 +        return (s390_facilities & FACILITY_GEN_INST_EXT
 +                ? C_O1_I2(r, 0, rJ)
 +                : C_O1_I2(r, 0, rI));
      case INDEX_op_shl_i32:
      case INDEX_op_shr_i32:
      case INDEX_op_sar_i32:
 -        return (s390_facilities & FACILITY_DISTINCT_OPS ? &r_r_ri : &r_0_ri);
 -
--    case INDEX_op_shl_i64:
+             for (i = 0; i < nb_oargs; i++) {
--    case INDEX_op_shr_i64:
+                 reset_temp(op->args[i]);
--    case INDEX_op_sar_i64:
+                 /* Save the corresponding known-zero bits mask for the
--        return &r_r_ri;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--
+             case INDEX_op_qemu_st_i32:
--    case INDEX_op_rotl_i32:
+             case INDEX_op_qemu_st8_i32:
--    case INDEX_op_rotl_i64:
+             case INDEX_op_qemu_st_i64:
--    case INDEX_op_rotr_i32:
+-            case INDEX_op_call:
--    case INDEX_op_rotr_i64:
+                 /* Opcodes that touch guest memory stop the optimization.  */
--        return &r_r_ri;
+                 ctx.prev_mb = NULL;
-+        return (s390_facilities & FACILITY_DISTINCT_OPS
+                 break;
 +                ? C_O1_I2(r, r, ri)
 +                : C_O1_I2(r, 0, ri));
      case INDEX_op_brcond_i32:
      case INDEX_op_brcond_i64:
 -        return &r_ri;
 +        return C_O0_I2(r, ri);
      case INDEX_op_bswap16_i32:
      case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_extu_i32_i64:
      case INDEX_op_extract_i32:
      case INDEX_op_extract_i64:
 -        return &r_r;
 -
 -    case INDEX_op_clz_i64:
 -    case INDEX_op_setcond_i32:
 -    case INDEX_op_setcond_i64:
 -        return &r_r_ri;
 +        return C_O1_I1(r, r);
      case INDEX_op_qemu_ld_i32:
      case INDEX_op_qemu_ld_i64:
 -        return &r_L;
 +        return C_O1_I1(r, L);
      case INDEX_op_qemu_st_i64:
      case INDEX_op_qemu_st_i32:
 -        return &L_L;
 +        return C_O0_I2(L, L);
      case INDEX_op_deposit_i32:
      case INDEX_op_deposit_i64:
 -        {
 -            static const TCGTargetOpDef dep
 -                = { .args_ct_str = { "r", "rZ", "r" } };
 -            return &dep;
 -        }
 +        return C_O1_I2(r, rZ, r);
 +
      case INDEX_op_movcond_i32:
      case INDEX_op_movcond_i64:
 -        {
 -            static const TCGTargetOpDef movc
 -                = { .args_ct_str = { "r", "r", "ri", "r", "0" } };
 -            static const TCGTargetOpDef movc_l
 -                = { .args_ct_str = { "r", "r", "ri", "rI", "0" } };
 -            return (s390_facilities & FACILITY_LOAD_ON_COND2 ? &movc_l : &movc);
 -        }
 +        return (s390_facilities & FACILITY_LOAD_ON_COND2
 +                ? C_O1_I4(r, r, ri, rI, 0)
 +                : C_O1_I4(r, r, ri, r, 0));
 +
      case INDEX_op_div2_i32:
      case INDEX_op_div2_i64:
      case INDEX_op_divu2_i32:
      case INDEX_op_divu2_i64:
 -        {
 -            static const TCGTargetOpDef div2
 -                = { .args_ct_str = { "b", "a", "0", "1", "r" } };
 -            return &div2;
 -        }
 +        return C_O2_I3(b, a, 0, 1, r);
 +
      case INDEX_op_mulu2_i64:
 -        {
 -            static const TCGTargetOpDef mul2
 -                = { .args_ct_str = { "b", "a", "0", "r" } };
 -            return &mul2;
 -        }
 +        return C_O2_I2(b, a, 0, r);
      case INDEX_op_add2_i32:
      case INDEX_op_sub2_i32:
 -        return (s390_facilities & FACILITY_EXT_IMM ? &a2_ri : &a2_r);
 +        return (s390_facilities & FACILITY_EXT_IMM
 +                ? C_O2_I4(r, r, 0, 1, ri, r)
 +                : C_O2_I4(r, r, 0, 1, r, r));
 +
      case INDEX_op_add2_i64:
      case INDEX_op_sub2_i64:
 -        return (s390_facilities & FACILITY_EXT_IMM ? &a2_rA : &a2_r);
 +        return (s390_facilities & FACILITY_EXT_IMM
 +                ? C_O2_I4(r, r, 0, 1, rA, r)
 +                : C_O2_I4(r, r, 0, 1, r, r));
      default:
          break;
 --
 .25.1

-New patch
+[PULL 14/56] tcg/optimize: Drop nb_oargs, nb_iargs locals
+Rather than try to keep these up-to-date across folding,
+re-read nb_oargs at the end, after re-reading the opcode.
+A couple of asserts need dropping, but that will take care
+of itself as we split the function further.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 14 ++++----------
+file changed, 4 insertions(+), 10 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
+         uint64_t z_mask, partmask, affected, tmp;
+-        int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         def = &tcg_op_defs[opc];
+-        nb_oargs = def->nb_oargs;
+-        nb_iargs = def->nb_iargs;
+-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
+-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
++        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
++        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
+         /* For commutative operations make constant second argument */
+         switch (opc) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(qemu_ld):
+             {
+-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
++                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
+                 MemOp mop = get_memop(oi);
+                 if (!(mop & MO_SIGN)) {
+                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         if (partmask == 0) {
+-            tcg_debug_assert(nb_oargs == 1);
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+             continue;
+         }
+         if (affected == 0) {
+-            tcg_debug_assert(nb_oargs == 1);
+             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+             continue;
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             } else if (args_are_copies(op->args[1], op->args[2])) {
+                 op->opc = INDEX_op_dup_vec;
+                 TCGOP_VECE(op) = MO_32;
+-                nb_iargs = 1;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
+                                  ? INDEX_op_setcond_i32
+                                  : INDEX_op_setcond_i64);
+-                nb_iargs = 2;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (def->flags & TCG_OPF_BB_END) {
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+         } else {
++            int nb_oargs = def->nb_oargs;
+             for (i = 0; i < nb_oargs; i++) {
+                 reset_temp(op->args[i]);
+                 /* Save the corresponding known-zero bits mask for the
+--
+.25.1

-[PATCH 40/43] tcg/riscv: Convert to tcg-constr.c.inc
+[PULL 15/56] tcg/optimize: Change fail return for do_constant_folding_cond*
+Return -1 instead of 2 for failure, so that we can
+use comparisons against 0 for all cases.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/riscv/tcg-target-constr.h | 25 +++++++++++
+ tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
- tcg/riscv/tcg-target.c.inc    | 82 ++++++++++-------------------------
+file changed, 74 insertions(+), 71 deletions(-)
-files changed, 49 insertions(+), 58 deletions(-)
- create mode 100644 tcg/riscv/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 diff --git a/tcg/riscv/tcg-target-constr.h b/tcg/riscv/tcg-target-constr.h
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/riscv/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * RISC-V target-specific operand constraints.
 + * Copyright (c) 2020 Linaro
 + */
 +
 +C_O0_I1(r)
 +C_O0_I2(LZ, L)
 +C_O0_I2(rZ, r)
 +C_O0_I2(rZ, rZ)
 +C_O0_I3(LZ, L, L)
 +C_O0_I3(LZ, LZ, L)
 +C_O0_I4(LZ, LZ, L, L)
 +C_O0_I4(rZ, rZ, rZ, rZ)
 +C_O1_I1(r, L)
 +C_O1_I1(r, r)
 +C_O1_I2(r, L, L)
 +C_O1_I2(r, r, ri)
 +C_O1_I2(r, r, rI)
 +C_O1_I2(r, rZ, rN)
 +C_O1_I2(r, rZ, rZ)
 +C_O1_I4(r, rZ, rZ, rZ, rZ)
 +C_O2_I1(r, r, L)
 +C_O2_I2(r, r, L, L)
 +C_O2_I4(r, r, rZ, rZ, rM, rM)
 diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/riscv/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/riscv/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
+@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
      }
  }
-+/* Define all constraint sets. */
+-/* Return 2 if the condition can't be simplified, and the result
-+#include "../tcg-constr.c.inc"
+-   of the condition (0 or 1) if it can */
-+
+-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
- static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
+-                                       TCGArg y, TCGCond c)
 +/*
 + * Return -1 if the condition can't be simplified,
 + * and the result of the condition (0 or 1) if it can.
 + */
 +static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +                                    TCGArg y, TCGCond c)
  {
--    static const TCGTargetOpDef r
+     uint64_t xv = arg_info(x)->val;
--        = { .args_ct_str = { "r" } };
+     uint64_t yv = arg_info(y)->val;
--    static const TCGTargetOpDef r_r
+@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
--        = { .args_ct_str = { "r", "r" } };
+         case TCG_COND_GEU:
--    static const TCGTargetOpDef rZ_r
+             return 1;
--        = { .args_ct_str = { "rZ", "r" } };
+         default:
--    static const TCGTargetOpDef rZ_rZ
+-            return 2;
--        = { .args_ct_str = { "rZ", "rZ" } };
++            return -1;
--    static const TCGTargetOpDef rZ_rZ_rZ_rZ
+         }
--        = { .args_ct_str = { "rZ", "rZ", "rZ", "rZ" } };
+     }
--    static const TCGTargetOpDef r_r_ri
+-    return 2;
--        = { .args_ct_str = { "r", "r", "ri" } };
++    return -1;
--    static const TCGTargetOpDef r_r_rI
+ }
--        = { .args_ct_str = { "r", "r", "rI" } };
--    static const TCGTargetOpDef r_rZ_rN
+-/* Return 2 if the condition can't be simplified, and the result
--        = { .args_ct_str = { "r", "rZ", "rN" } };
+-   of the condition (0 or 1) if it can */
--    static const TCGTargetOpDef r_rZ_rZ
+-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
--        = { .args_ct_str = { "r", "rZ", "rZ" } };
++/*
--    static const TCGTargetOpDef r_rZ_rZ_rZ_rZ
++ * Return -1 if the condition can't be simplified,
--        = { .args_ct_str = { "r", "rZ", "rZ", "rZ", "rZ" } };
++ * and the result of the condition (0 or 1) if it can.
--    static const TCGTargetOpDef r_L
++ */
--        = { .args_ct_str = { "r", "L" } };
++static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
--    static const TCGTargetOpDef r_r_L
+ {
--        = { .args_ct_str = { "r", "r", "L" } };
+     TCGArg al = p1[0], ah = p1[1];
--    static const TCGTargetOpDef r_L_L
+     TCGArg bl = p2[0], bh = p2[1];
--        = { .args_ct_str = { "r", "L", "L" } };
+@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
--    static const TCGTargetOpDef r_r_L_L
+     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
--        = { .args_ct_str = { "r", "r", "L", "L" } };
+         return do_constant_folding_cond_eq(c);
--    static const TCGTargetOpDef LZ_L
+     }
--        = { .args_ct_str = { "LZ", "L" } };
+-    return 2;
--    static const TCGTargetOpDef LZ_L_L
++    return -1;
--        = { .args_ct_str = { "LZ", "L", "L" } };
+ }
--    static const TCGTargetOpDef LZ_LZ_L
--        = { .args_ct_str = { "LZ", "LZ", "L" } };
+ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
--    static const TCGTargetOpDef LZ_LZ_L_L
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--        = { .args_ct_str = { "LZ", "LZ", "L", "L" } };
+             break;
--    static const TCGTargetOpDef r_r_rZ_rZ_rM_rM
--        = { .args_ct_str = { "r", "r", "rZ", "rZ", "rM", "rM" } };
+         CASE_OP_32_64(setcond):
--
+-            tmp = do_constant_folding_cond(opc, op->args[1],
-     switch (op) {
+-                                           op->args[2], op->args[3]);
-     case INDEX_op_goto_ptr:
+-            if (tmp != 2) {
--        return &r;
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-+        return C_O0_I1(r);
++            i = do_constant_folding_cond(opc, op->args[1],
++                                         op->args[2], op->args[3]);
-     case INDEX_op_ld8u_i32:
++            if (i >= 0) {
-     case INDEX_op_ld8s_i32:
++                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
+                 continue;
-     case INDEX_op_extrl_i64_i32:
+             }
-     case INDEX_op_extrh_i64_i32:
+             break;
-     case INDEX_op_ext_i32_i64:
--        return &r_r;
+         CASE_OP_32_64(brcond):
-+        return C_O1_I1(r, r);
+-            tmp = do_constant_folding_cond(opc, op->args[0],
+-                                           op->args[1], op->args[2]);
-     case INDEX_op_st8_i32:
+-            switch (tmp) {
-     case INDEX_op_st16_i32:
+-            case 0:
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++            i = do_constant_folding_cond(opc, op->args[0],
-     case INDEX_op_st16_i64:
++                                         op->args[1], op->args[2]);
-     case INDEX_op_st32_i64:
++            if (i == 0) {
-     case INDEX_op_st_i64:
+                 tcg_op_remove(s, op);
--        return &rZ_r;
+                 continue;
-+        return C_O0_I2(rZ, r);
+-            case 1:
++            } else if (i > 0) {
-     case INDEX_op_add_i32:
+                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-     case INDEX_op_and_i32:
+                 op->opc = opc = INDEX_op_br;
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
+                 op->args[0] = op->args[3];
-     case INDEX_op_and_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     case INDEX_op_or_i64:
+             break;
-     case INDEX_op_xor_i64:
--        return &r_r_rI;
+         CASE_OP_32_64(movcond):
-+        return C_O1_I2(r, r, rI);
+-            tmp = do_constant_folding_cond(opc, op->args[1],
+-                                           op->args[2], op->args[5]);
-     case INDEX_op_sub_i32:
+-            if (tmp != 2) {
-     case INDEX_op_sub_i64:
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
--        return &r_rZ_rN;
++            i = do_constant_folding_cond(opc, op->args[1],
-+        return C_O1_I2(r, rZ, rN);
++                                         op->args[2], op->args[5]);
++            if (i >= 0) {
-     case INDEX_op_mul_i32:
++                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
-     case INDEX_op_mulsh_i32:
+                 continue;
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
+             }
-     case INDEX_op_rem_i64:
+             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-     case INDEX_op_remu_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     case INDEX_op_setcond_i64:
+             break;
--        return &r_rZ_rZ;
-+        return C_O1_I2(r, rZ, rZ);
+         case INDEX_op_brcond2_i32:
+-            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
-     case INDEX_op_shl_i32:
+-                                            op->args[4]);
-     case INDEX_op_shr_i32:
+-            if (tmp == 0) {
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
-     case INDEX_op_shl_i64:
++                                          op->args[4]);
-     case INDEX_op_shr_i64:
++            if (i == 0) {
-     case INDEX_op_sar_i64:
+             do_brcond_false:
--        return &r_r_ri;
+                 tcg_op_remove(s, op);
-+        return C_O1_I2(r, r, ri);
+                 continue;
+             }
-     case INDEX_op_brcond_i32:
+-            if (tmp == 1) {
-     case INDEX_op_brcond_i64:
++            if (i > 0) {
--        return &rZ_rZ;
+             do_brcond_true:
-+        return C_O0_I2(rZ, rZ);
+                 op->opc = opc = INDEX_op_br;
+                 op->args[0] = op->args[5];
-     case INDEX_op_add2_i32:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     case INDEX_op_add2_i64:
+             if (op->args[4] == TCG_COND_EQ) {
-     case INDEX_op_sub2_i32:
+                 /* Simplify EQ comparisons where one of the pairs
-     case INDEX_op_sub2_i64:
+                    can be simplified.  */
--        return &r_r_rZ_rZ_rM_rM;
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-+        return C_O2_I4(r, r, rZ, rZ, rM, rM);
+-                                               op->args[0], op->args[2],
+-                                               TCG_COND_EQ);
-     case INDEX_op_brcond2_i32:
+-                if (tmp == 0) {
--        return &rZ_rZ_rZ_rZ;
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-+        return C_O0_I4(rZ, rZ, rZ, rZ);
++                                             op->args[0], op->args[2],
++                                             TCG_COND_EQ);
-     case INDEX_op_setcond2_i32:
++                if (i == 0) {
--        return &r_rZ_rZ_rZ_rZ;
+                     goto do_brcond_false;
-+        return C_O1_I4(r, rZ, rZ, rZ, rZ);
+-                } else if (tmp == 1) {
++                } else if (i > 0) {
-     case INDEX_op_qemu_ld_i32:
+                     goto do_brcond_high;
--        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
+                 }
-+        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-+                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
+-                                               op->args[1], op->args[3],
-     case INDEX_op_qemu_st_i32:
+-                                               TCG_COND_EQ);
--        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &LZ_L : &LZ_L_L;
+-                if (tmp == 0) {
-+        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-+                ? C_O0_I2(LZ, L) : C_O0_I3(LZ, L, L));
++                                             op->args[1], op->args[3],
-     case INDEX_op_qemu_ld_i64:
++                                             TCG_COND_EQ);
--        return TCG_TARGET_REG_BITS == 64 ? &r_L
++                if (i == 0) {
--               : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
+                     goto do_brcond_false;
--               : &r_r_L_L;
+-                } else if (tmp != 1) {
-+        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
++                } else if (i < 0) {
-+               : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
+                     break;
-+               : C_O2_I2(r, r, L, L));
+                 }
-     case INDEX_op_qemu_st_i64:
+             do_brcond_low:
--        return TCG_TARGET_REG_BITS == 64 ? &LZ_L
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--               : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &LZ_LZ_L
+             if (op->args[4] == TCG_COND_NE) {
--               : &LZ_LZ_L_L;
+                 /* Simplify NE comparisons where one of the pairs
-+        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(LZ, L)
+                    can be simplified.  */
-+               : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(LZ, LZ, L)
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-+               : C_O0_I4(LZ, LZ, L, L));
+-                                               op->args[0], op->args[2],
+-                                               TCG_COND_NE);
-     default:
+-                if (tmp == 0) {
-         return NULL;
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[0], op->args[2],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
              }
              break;
          case INDEX_op_setcond2_i32:
 -            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
 -                                            op->args[5]);
 -            if (tmp != 2) {
 +            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
 +                                          op->args[5]);
 +            if (i >= 0) {
              do_setcond_const:
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_const;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_high;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp != 1) {
 +                } else if (i < 0) {
                      break;
                  }
              do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
              }
 --
 .25.1

-[PATCH 25/43] tcg: Use tcg_constant_{i32,i64} with tcg plugins
+[PULL 16/56] tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
+This will allow callers to tail call to these functions
+and return true indicating processing complete.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/plugin-gen.c | 49 +++++++++++++++++++-----------------------
+ tcg/optimize.c | 9 +++++----
-file changed, 22 insertions(+), 27 deletions(-)
+file changed, 5 insertions(+), 4 deletions(-)
-diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/plugin-gen.c
+--- a/tcg/optimize.c
-+++ b/accel/tcg/plugin-gen.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static TCGOp *copy_extu_i32_i64(TCGOp **begin_op, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
-     if (TCG_TARGET_REG_BITS == 32) {
+     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
          /* mov_i32 */
          op = copy_op(begin_op, op, INDEX_op_mov_i32);
 -        /* movi_i32 */
 -        op = copy_op(begin_op, op, INDEX_op_movi_i32);
 +        /* mov_i32 w/ $0 */
 +        op = copy_op(begin_op, op, INDEX_op_mov_i32);
      } else {
          /* extu_i32_i64 */
          op = copy_op(begin_op, op, INDEX_op_extu_i32_i64);
@@ -XXX,XX +XXX,XX @@ static TCGOp *copy_mov_i64(TCGOp **begin_op, TCGOp *op)
      return op;
  }
--static TCGOp *copy_movi_i64(TCGOp **begin_op, TCGOp *op, uint64_t v)
+-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
--{
++static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 -    if (TCG_TARGET_REG_BITS == 32) {
 -        /* 2x movi_i32 */
 -        op = copy_op(begin_op, op, INDEX_op_movi_i32);
 -        op->args[1] = v;
 -
 -        op = copy_op(begin_op, op, INDEX_op_movi_i32);
 -        op->args[1] = v >> 32;
 -    } else {
 -        /* movi_i64 */
 -        op = copy_op(begin_op, op, INDEX_op_movi_i64);
 -        op->args[1] = v;
 -    }
 -    return op;
 -}
 -
  static TCGOp *copy_const_ptr(TCGOp **begin_op, TCGOp *op, void *ptr)
  {
-     if (UINTPTR_MAX == UINT32_MAX) {
+     TCGTemp *dst_ts = arg_temp(dst);
--        /* movi_i32 */
+     TCGTemp *src_ts = arg_temp(src);
--        op = copy_op(begin_op, op, INDEX_op_movi_i32);
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
--        op->args[1] = (uintptr_t)ptr;
-+        /* mov_i32 */
+     if (ts_are_copies(dst_ts, src_ts)) {
-+        op = copy_op(begin_op, op, INDEX_op_mov_i32);
+         tcg_op_remove(ctx->tcg, op);
-+        op->args[1] = tcgv_i32_arg(tcg_constant_i32((uintptr_t)ptr));
+-        return;
-     } else {
++        return true;
 -        /* movi_i64 */
 -        op = copy_movi_i64(begin_op, op, (uint64_t)(uintptr_t)ptr);
 +        /* mov_i64 */
 +        op = copy_op(begin_op, op, INDEX_op_mov_i64);
 +        op->args[1] = tcgv_i64_arg(tcg_constant_i64((uintptr_t)ptr));
      }
-     return op;
      reset_ts(dst_ts);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
          di->is_const = si->is_const;
          di->val = si->val;
      }
 +    return true;
  }
- static TCGOp *copy_const_i64(TCGOp **begin_op, TCGOp *op, uint64_t v)
+-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
 +static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
--    return copy_movi_i64(begin_op, op, v);
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
-+    if (TCG_TARGET_REG_BITS == 32) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-+        /* 2x mov_i32 */
+     /* Convert movi to mov with constant temp. */
-+        op = copy_op(begin_op, op, INDEX_op_mov_i32);
+     tv = tcg_constant_internal(type, val);
-+        op->args[1] = tcgv_i32_arg(tcg_constant_i32(v));
+     init_ts_info(ctx, tv);
-+        op = copy_op(begin_op, op, INDEX_op_mov_i32);
+-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
-+        op->args[1] = tcgv_i32_arg(tcg_constant_i32(v >> 32));
++    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 +    } else {
 +        /* mov_i64 */
 +        op = copy_op(begin_op, op, INDEX_op_mov_i64);
 +        op->args[1] = tcgv_i64_arg(tcg_constant_i64(v));
 +    }
 +    return op;
  }
- static TCGOp *copy_extu_tl_i64(TCGOp **begin_op, TCGOp *op)
+ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ static TCGOp *append_mem_cb(const struct qemu_plugin_dyn_cb *cb,
      tcg_debug_assert(type == PLUGIN_GEN_CB_MEM);
 -    /* const_i32 == movi_i32 ("info", so it remains as is) */
 -    op = copy_op(&begin_op, op, INDEX_op_movi_i32);
 +    /* const_i32 == mov_i32 ("info", so it remains as is) */
 +    op = copy_op(&begin_op, op, INDEX_op_mov_i32);
      /* const_ptr */
      op = copy_const_ptr(&begin_op, op, cb->userp);
 --
 .25.1

-[PATCH 29/43] tcg: Add tcg_reg_alloc_dup2
+[PULL 17/56] tcg/optimize: Split out finish_folding
-There are several ways we can expand a vector dup of a 64-bit
+Copy z_mask into OptContext, for writeback to the
-element on a 32-bit host.
+first output within the new function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
-file changed, 97 insertions(+)
+file changed, 33 insertions(+), 16 deletions(-)
-diff --git a/tcg/tcg.c b/tcg/tcg.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
+--- a/tcg/optimize.c
-+++ b/tcg/tcg.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      TCGContext *tcg;
      TCGOp *prev_mb;
      TCGTempSet temps_used;
 +
 +    /* In flight values from optimization. */
 +    uint64_t z_mask;
  } OptContext;
  static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
      }
  }
-+static void tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
++static void finish_folding(OptContext *ctx, TCGOp *op)
 +{
-+    const TCGLifeData arg_life = op->life;
++    const TCGOpDef *def = &tcg_op_defs[op->opc];
-+    TCGTemp *ots, *itsl, *itsh;
++    int i, nb_oargs;
 +    TCGType vtype = TCGOP_VECL(op) + TCG_TYPE_V64;
 +
-+    /* This opcode is only valid for 32-bit hosts, for 64-bit elements. */
++    /*
-+    tcg_debug_assert(TCG_TARGET_REG_BITS == 32);
++     * For an opcode that ends a BB, reset all temp data.
-+    tcg_debug_assert(TCGOP_VECE(op) == MO_64);
++     * We do no cross-BB optimization.
-+
++     */
-+    ots = arg_temp(op->args[0]);
++    if (def->flags & TCG_OPF_BB_END) {
-+    itsl = arg_temp(op->args[1]);
++        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
-+    itsh = arg_temp(op->args[2]);
++        ctx->prev_mb = NULL;
-+
++        return;
 +    /* ENV should not be modified.  */
 +    tcg_debug_assert(!temp_readonly(ots));
 +
 +    /* Allocate the output register now.  */
 +    if (ots->val_type != TEMP_VAL_REG) {
 +        TCGRegSet allocated_regs = s->reserved_regs;
 +        TCGRegSet dup_out_regs =
 +            tcg_op_defs[INDEX_op_dup_vec].args_ct[0].regs;
 +
 +        /* Make sure to not spill the input registers. */
 +        if (!IS_DEAD_ARG(1) && itsl->val_type == TEMP_VAL_REG) {
 +            tcg_regset_set_reg(allocated_regs, itsl->reg);
 +        }
 +        if (!IS_DEAD_ARG(2) && itsh->val_type == TEMP_VAL_REG) {
 +            tcg_regset_set_reg(allocated_regs, itsh->reg);
 +        }
 +
 +        ots->reg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
 +                                 op->output_pref[0], ots->indirect_base);
 +        ots->val_type = TEMP_VAL_REG;
 +        ots->mem_coherent = 0;
 +        s->reg_to_temp[ots->reg] = ots;
 +    }
 +
-+    /* Promote dup2 of immediates to dupi_vec. */
++    nb_oargs = def->nb_oargs;
-+    if (itsl->val_type == TEMP_VAL_CONST && itsh->val_type == TEMP_VAL_CONST) {
++    for (i = 0; i < nb_oargs; i++) {
-+        uint64_t val = deposit64(itsl->val, 32, 32, itsh->val);
++        reset_temp(op->args[i]);
-+        MemOp vece = MO_64;
++        /*
-+
++         * Save the corresponding known-zero bits mask for the
-+        if (val == dup_const(MO_8, val)) {
++         * first output argument (only one supported so far).
-+            vece = MO_8;
++         */
-+        } else if (val == dup_const(MO_16, val)) {
++        if (i == 0) {
-+            vece = MO_16;
++            arg_info(op->args[i])->z_mask = ctx->z_mask;
 +        } else if (val == dup_const(MO_32, val)) {
 +            vece = MO_32;
 +        }
-+
-+        tcg_out_dupi_vec(s, vtype, vece, ots->reg, val);
-+        goto done;
-+    }
-+
-+    /* If the two inputs form one 64-bit value, try dupm_vec. */
-+    if (itsl + 1 == itsh &&
-+        itsl->base_type == TCG_TYPE_I64 &&
-+        itsh->base_type == TCG_TYPE_I64) {
-+        if (!itsl->mem_coherent) {
-+            temp_sync(s, itsl, s->reserved_regs, 0, 0);
-+        }
-+        if (!itsl->mem_coherent) {
-+            temp_sync(s, itsl, s->reserved_regs, 0, 0);
-+        }
-+#ifdef HOST_WORDS_BIGENDIAN
-+        TCGTemp *its = itsh;
-+#else
-+        TCGTemp *its = itsl;
-+#endif
-+        if (tcg_out_dupm_vec(s, vtype, MO_64, ots->reg,
-+                             its->mem_base->reg, its->mem_offset)) {
-+            goto done;
-+        }
-+    }
-+
-+    /* Fall back to generic expansion. */
-+    tcg_reg_alloc_op(s, op);
-+    return;
-+
-+ done:
-+    if (IS_DEAD_ARG(1)) {
-+        temp_dead(s, itsl);
-+    }
-+    if (IS_DEAD_ARG(2)) {
-+        temp_dead(s, itsh);
-+    }
-+    if (NEED_SYNC_ARG(0)) {
-+        temp_sync(s, ots, s->reserved_regs, 0, IS_DEAD_ARG(0));
-+    } else if (IS_DEAD_ARG(0)) {
-+        temp_dead(s, ots);
 +    }
 +}
 +
- #ifdef TCG_TARGET_STACK_GROWSUP
+ static bool fold_call(OptContext *ctx, TCGOp *op)
- #define STACK_DIR(x) (-(x))
+ {
- #else
+     TCGContext *s = ctx->tcg;
-@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-         case INDEX_op_dup_vec:
+             partmask &= 0xffffffffu;
-             tcg_reg_alloc_dup(s, op);
+             affected &= 0xffffffffu;
          }
 +        ctx.z_mask = z_mask;
          if (partmask == 0) {
              tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
-+        case INDEX_op_dup2_vec:
+         }
-+            tcg_reg_alloc_dup2(s, op);
-+            break;
+-        /* Some of the folding above can change opc. */
-         case INDEX_op_insn_start:
+-        opc = op->opc;
-             if (num_insns >= 0) {
+-        def = &tcg_op_defs[opc];
-                 size_t off = tcg_current_code_size(s);
+-        if (def->flags & TCG_OPF_BB_END) {
 -            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
 -        } else {
 -            int nb_oargs = def->nb_oargs;
 -            for (i = 0; i < nb_oargs; i++) {
 -                reset_temp(op->args[i]);
 -                /* Save the corresponding known-zero bits mask for the
 -                   first output argument (only one supported so far). */
 -                if (i == 0) {
 -                    arg_info(op->args[i])->z_mask = z_mask;
 -                }
 -            }
 -        }
 +        finish_folding(&ctx, op);
          /* Eliminate duplicate and redundant fence instructions.  */
          if (ctx.prev_mb) {
 --
 .25.1

-[PATCH 30/43] tcg/i386: Use tcg_constant_vec with tcg vec expanders
+[PULL 18/56] tcg/optimize: Use a boolean to avoid a mass of continues
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/i386/tcg-target.c.inc | 26 +++++++++++++-------------
+ tcg/optimize.c | 9 ++++++---
-file changed, 13 insertions(+), 13 deletions(-)
+file changed, 6 insertions(+), 3 deletions(-)
-diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/i386/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/i386/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- static void expand_vec_mul(TCGType type, unsigned vece,
+         uint64_t z_mask, partmask, affected, tmp;
-                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
+         TCGOpcode opc = op->opc;
- {
+         const TCGOpDef *def;
--    TCGv_vec t1, t2, t3, t4;
++        bool done = false;
-+    TCGv_vec t1, t2, t3, t4, zero;
+         /* Calls are special. */
-     tcg_debug_assert(vece == MO_8);
+         if (opc == INDEX_op_call) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece,
+            allocator where needed and possible.  Also detect copies. */
-     case TCG_TYPE_V64:
+         switch (opc) {
-         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+         CASE_OP_32_64_VEC(mov):
-         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
+-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
--        tcg_gen_dup16i_vec(t2, 0);
+-            continue;
-+        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
++            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
++            break;
--                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
-+                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
+         case INDEX_op_dup_vec:
-         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+             if (arg_is_const(op->args[1])) {
--                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
+             break;
-         tcg_gen_mul_vec(MO_16, t1, t1, t2);
+         }
-         tcg_gen_shri_vec(MO_16, t1, t1, 8);
-         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
+-        finish_folding(&ctx, op);
-@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece,
++        if (!done) {
-         t2 = tcg_temp_new_vec(type);
++            finish_folding(&ctx, op);
-         t3 = tcg_temp_new_vec(type);
++        }
-         t4 = tcg_temp_new_vec(type);
--        tcg_gen_dup16i_vec(t4, 0);
+         /* Eliminate duplicate and redundant fence instructions.  */
-+        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
+         if (ctx.prev_mb) {
          vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
 -                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
 +                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
          vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
 -                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
 +                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
          vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
 -                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
 +                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
          vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
 -                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
 +                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
          tcg_gen_mul_vec(MO_16, t1, t1, t2);
          tcg_gen_mul_vec(MO_16, t3, t3, t4);
          tcg_gen_shri_vec(MO_16, t1, t1, 8);
@@ -XXX,XX +XXX,XX @@ static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
          NEED_UMIN = 8,
          NEED_UMAX = 16,
      };
 -    TCGv_vec t1, t2;
 +    TCGv_vec t1, t2, t3;
      uint8_t fixup;
      switch (cond) {
@@ -XXX,XX +XXX,XX @@ static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
      } else if (fixup & NEED_BIAS) {
          t1 = tcg_temp_new_vec(type);
          t2 = tcg_temp_new_vec(type);
 -        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
 -        tcg_gen_sub_vec(vece, t1, v1, t2);
 -        tcg_gen_sub_vec(vece, t2, v2, t2);
 +        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
 +        tcg_gen_sub_vec(vece, t1, v1, t3);
 +        tcg_gen_sub_vec(vece, t2, v2, t3);
          v1 = t1;
          v2 = t2;
          cond = tcg_signed_cond(cond);
 --
 .25.1

-New patch
+[PULL 19/56] tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
+This puts the separate mb optimization into the same framework
+as the others.  While fold_qemu_{ld,st} are currently identical,
+that won't last as more code gets moved.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
+file changed, 51 insertions(+), 38 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_mb(OptContext *ctx, TCGOp *op)
++{
++    /* Eliminate duplicate and redundant fence instructions.  */
++    if (ctx->prev_mb) {
++        /*
++         * Merge two barriers of the same type into one,
++         * or a weaker barrier into a stronger one,
++         * or two weaker barriers into a stronger one.
++         *   mb X; mb Y => mb X|Y
++         *   mb; strl => mb; st
++         *   ldaq; mb => ld; mb
++         *   ldaq; strl => ld; mb; st
++         * Other combinations are also merged into a strong
++         * barrier.  This is stricter than specified but for
++         * the purposes of TCG is better than not optimizing.
++         */
++        ctx->prev_mb->args[0] |= op->args[0];
++        tcg_op_remove(ctx->tcg, op);
++    } else {
++        ctx->prev_mb = op;
++    }
++    return true;
++}
++
++static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
++{
++    /* Opcodes that touch guest memory stop the mb optimization.  */
++    ctx->prev_mb = NULL;
++    return false;
++}
++
++static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
++{
++    /* Opcodes that touch guest memory stop the mb optimization.  */
++    ctx->prev_mb = NULL;
++    return false;
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
++        case INDEX_op_mb:
++            done = fold_mb(&ctx, op);
++            break;
++        case INDEX_op_qemu_ld_i32:
++        case INDEX_op_qemu_ld_i64:
++            done = fold_qemu_ld(&ctx, op);
++            break;
++        case INDEX_op_qemu_st_i32:
++        case INDEX_op_qemu_st8_i32:
++        case INDEX_op_qemu_st_i64:
++            done = fold_qemu_st(&ctx, op);
++            break;
++
+         default:
+             break;
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (!done) {
+             finish_folding(&ctx, op);
+         }
+-
+-        /* Eliminate duplicate and redundant fence instructions.  */
+-        if (ctx.prev_mb) {
+-            switch (opc) {
+-            case INDEX_op_mb:
+-                /* Merge two barriers of the same type into one,
+-                 * or a weaker barrier into a stronger one,
+-                 * or two weaker barriers into a stronger one.
+-                 *   mb X; mb Y => mb X|Y
+-                 *   mb; strl => mb; st
+-                 *   ldaq; mb => ld; mb
+-                 *   ldaq; strl => ld; mb; st
+-                 * Other combinations are also merged into a strong
+-                 * barrier.  This is stricter than specified but for
+-                 * the purposes of TCG is better than not optimizing.
+-                 */
+-                ctx.prev_mb->args[0] |= op->args[0];
+-                tcg_op_remove(s, op);
+-                break;
+-
+-            default:
+-                /* Opcodes that end the block stop the optimization.  */
+-                if ((def->flags & TCG_OPF_BB_END) == 0) {
+-                    break;
+-                }
+-                /* fallthru */
+-            case INDEX_op_qemu_ld_i32:
+-            case INDEX_op_qemu_ld_i64:
+-            case INDEX_op_qemu_st_i32:
+-            case INDEX_op_qemu_st8_i32:
+-            case INDEX_op_qemu_st_i64:
+-                /* Opcodes that touch guest memory stop the optimization.  */
+-                ctx.prev_mb = NULL;
+-                break;
+-            }
+-        } else if (opc == INDEX_op_mb) {
+-            ctx.prev_mb = op;
+-        }
+     }
+ }
+--
+.25.1

-[PATCH 26/43] tcg: Use tcg_constant_{i32, i64, vec} with gvec expanders
+[PULL 20/56] tcg/optimize: Split out fold_const{1,2}
+Split out a whole bunch of placeholder functions, which are
+currently identical.  That won't last as more code gets moved.
+Use CASE_OP_32_64_VEC for some logical operators that previously
+missed the addition of vectors.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h |   1 +
+ tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
- tcg/tcg-op-gvec.c | 125 ++++++++++++++++++----------------------------
+file changed, 219 insertions(+), 52 deletions(-)
- tcg/tcg.c         |   8 +++
-files changed, 58 insertions(+), 76 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline TCGv_i64 tcg_constant_i64(int64_t val)
+@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      }
  }
- TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val);
++/*
-+TCGv_vec tcg_constant_vec_matching(TCGv_vec match, unsigned vece, int64_t val);
++ * The fold_* functions return true when processing is complete,
++ * usually by folding the operation to a constant or to a copy,
- #if UINTPTR_MAX == UINT32_MAX
++ * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
- # define tcg_const_ptr(x)        ((TCGv_ptr)tcg_const_i32((intptr_t)(x)))
++ * like collect information about the value produced, for use in
-diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
++ * optimizing a subsequent operation.
-index XXXXXXX..XXXXXXX 100644
++ *
---- a/tcg/tcg-op-gvec.c
++ * These first fold_* functions are all helpers, used by other
-+++ b/tcg/tcg-op-gvec.c
++ * folders for more specific operations.
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
++ */
-                         gen_helper_gvec_2 *fn)
++
 +static bool fold_const1(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
 +        t = arg_info(op->args[1])->val;
 +        t = do_constant_folding(op->opc, t, 0);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
 +static bool fold_const2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t t1 = arg_info(op->args[1])->val;
 +        uint64_t t2 = arg_info(op->args[2])->val;
 +
 +        t1 = do_constant_folding(op->opc, t1, t2);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
 +    }
 +    return false;
 +}
 +
 +/*
 + * These outermost fold_<op> functions are sorted alphabetically.
 + */
 +
 +static bool fold_add(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_and(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_andc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
-     TCGv_ptr a0, a1;
+     TCGContext *s = ctx->tcg;
--    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
-+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
+     return true;
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
      tcg_temp_free_ptr(a0);
      tcg_temp_free_ptr(a1);
 -    tcg_temp_free_i32(desc);
  }
- /* Generate a call to a gvec-style helper with two vector operands
++static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
++{
-                          gen_helper_gvec_2i *fn)
++    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_divide(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_eqv(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_exts(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_extu(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
  static bool fold_mb(OptContext *ctx, TCGOp *op)
  {
-     TCGv_ptr a0, a1;
+     /* Eliminate duplicate and redundant fence instructions.  */
--    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
-+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
+     return true;
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
      tcg_temp_free_ptr(a0);
      tcg_temp_free_ptr(a1);
 -    tcg_temp_free_i32(desc);
  }
- /* Generate a call to a gvec-style helper with three vector operands.  */
++static bool fold_mul(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
++{
-                         gen_helper_gvec_3 *fn)
++    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_nand(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_neg(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_nor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_not(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_or(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_orc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
-     TCGv_ptr a0, a1, a2;
+     /* Opcodes that touch guest memory stop the mb optimization.  */
--    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
-+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
+     return false;
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
      tcg_temp_free_ptr(a0);
      tcg_temp_free_ptr(a1);
      tcg_temp_free_ptr(a2);
 -    tcg_temp_free_i32(desc);
  }
- /* Generate a call to a gvec-style helper with four vector operands.  */
++static bool fold_remainder(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
++{
-                         int32_t data, gen_helper_gvec_4 *fn)
++    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_shift(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_sub(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_xor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
-     TCGv_ptr a0, a1, a2, a3;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 -    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 +    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
      tcg_temp_free_ptr(a1);
      tcg_temp_free_ptr(a2);
      tcg_temp_free_ptr(a3);
 -    tcg_temp_free_i32(desc);
  }
  /* Generate a call to a gvec-style helper with five vector operands.  */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
  {
      TCGv_ptr a0, a1, a2, a3, a4;
 -    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 +    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
      tcg_temp_free_ptr(a2);
      tcg_temp_free_ptr(a3);
      tcg_temp_free_ptr(a4);
 -    tcg_temp_free_i32(desc);
  }
  /* Generate a call to a gvec-style helper with three vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                          int32_t data, gen_helper_gvec_2_ptr *fn)
  {
      TCGv_ptr a0, a1;
 -    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 +    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
      tcg_temp_free_ptr(a0);
      tcg_temp_free_ptr(a1);
 -    tcg_temp_free_i32(desc);
  }
  /* Generate a call to a gvec-style helper with three vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          int32_t data, gen_helper_gvec_3_ptr *fn)
  {
      TCGv_ptr a0, a1, a2;
 -    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 +    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
      tcg_temp_free_ptr(a0);
      tcg_temp_free_ptr(a1);
      tcg_temp_free_ptr(a2);
 -    tcg_temp_free_i32(desc);
  }
  /* Generate a call to a gvec-style helper with four vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          gen_helper_gvec_4_ptr *fn)
  {
      TCGv_ptr a0, a1, a2, a3;
 -    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 +    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
      tcg_temp_free_ptr(a1);
      tcg_temp_free_ptr(a2);
      tcg_temp_free_ptr(a3);
 -    tcg_temp_free_i32(desc);
  }
  /* Generate a call to a gvec-style helper with five vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                          gen_helper_gvec_5_ptr *fn)
  {
      TCGv_ptr a0, a1, a2, a3, a4;
 -    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
 +    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
      a0 = tcg_temp_new_ptr();
      a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
      tcg_temp_free_ptr(a2);
      tcg_temp_free_ptr(a3);
      tcg_temp_free_ptr(a4);
 -    tcg_temp_free_i32(desc);
  }
  /* Return true if we want to implement something of OPRSZ bytes
@@ -XXX,XX +XXX,XX @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                  || (TCG_TARGET_REG_BITS == 64
                      && (in_c == 0 || in_c == -1
                          || !check_size_impl(oprsz, 4)))) {
 -                t_64 = tcg_const_i64(in_c);
 +                t_64 = tcg_constant_i64(in_c);
              } else {
 -                t_32 = tcg_const_i32(in_c);
 +                t_32 = tcg_constant_i32(in_c);
              }
+             break;
+-        CASE_OP_32_64(not):
+-        CASE_OP_32_64(neg):
+-        CASE_OP_32_64(ext8s):
+-        CASE_OP_32_64(ext8u):
+-        CASE_OP_32_64(ext16s):
+-        CASE_OP_32_64(ext16u):
+-        CASE_OP_32_64(ctpop):
+-        case INDEX_op_ext32s_i64:
+-        case INDEX_op_ext32u_i64:
+-        case INDEX_op_ext_i32_i64:
+-        case INDEX_op_extu_i32_i64:
+-        case INDEX_op_extrl_i64_i32:
+-        case INDEX_op_extrh_i64_i32:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(bswap16):
+         CASE_OP_32_64(bswap32):
+         case INDEX_op_bswap64_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(add):
+-        CASE_OP_32_64(sub):
+-        CASE_OP_32_64(mul):
+-        CASE_OP_32_64(or):
+-        CASE_OP_32_64(and):
+-        CASE_OP_32_64(xor):
+-        CASE_OP_32_64(shl):
+-        CASE_OP_32_64(shr):
+-        CASE_OP_32_64(sar):
+-        CASE_OP_32_64(rotl):
+-        CASE_OP_32_64(rotr):
+-        CASE_OP_32_64(andc):
+-        CASE_OP_32_64(orc):
+-        CASE_OP_32_64(eqv):
+-        CASE_OP_32_64(nand):
+-        CASE_OP_32_64(nor):
+-        CASE_OP_32_64(muluh):
+-        CASE_OP_32_64(mulsh):
+-        CASE_OP_32_64(div):
+-        CASE_OP_32_64(divu):
+-        CASE_OP_32_64(rem):
+-        CASE_OP_32_64(remu):
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+-                                          arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+             if (arg_is_const(op->args[1])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
++        default:
++            break;
++
++        /* ---------------------------------------------------------- */
++        /* Sorted alphabetically by opcode as much as possible. */
++
++        CASE_OP_32_64_VEC(add):
++            done = fold_add(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(and):
++            done = fold_and(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(andc):
++            done = fold_andc(&ctx, op);
++            break;
++        CASE_OP_32_64(ctpop):
++            done = fold_ctpop(&ctx, op);
++            break;
++        CASE_OP_32_64(div):
++        CASE_OP_32_64(divu):
++            done = fold_divide(&ctx, op);
++            break;
++        CASE_OP_32_64(eqv):
++            done = fold_eqv(&ctx, op);
++            break;
++        CASE_OP_32_64(ext8s):
++        CASE_OP_32_64(ext16s):
++        case INDEX_op_ext32s_i64:
++        case INDEX_op_ext_i32_i64:
++            done = fold_exts(&ctx, op);
++            break;
++        CASE_OP_32_64(ext8u):
++        CASE_OP_32_64(ext16u):
++        case INDEX_op_ext32u_i64:
++        case INDEX_op_extu_i32_i64:
++        case INDEX_op_extrl_i64_i32:
++        case INDEX_op_extrh_i64_i32:
++            done = fold_extu(&ctx, op);
++            break;
+         case INDEX_op_mb:
+             done = fold_mb(&ctx, op);
+             break;
++        CASE_OP_32_64(mul):
++            done = fold_mul(&ctx, op);
++            break;
++        CASE_OP_32_64(mulsh):
++        CASE_OP_32_64(muluh):
++            done = fold_mul_highpart(&ctx, op);
++            break;
++        CASE_OP_32_64(nand):
++            done = fold_nand(&ctx, op);
++            break;
++        CASE_OP_32_64(neg):
++            done = fold_neg(&ctx, op);
++            break;
++        CASE_OP_32_64(nor):
++            done = fold_nor(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(not):
++            done = fold_not(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(or):
++            done = fold_or(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(orc):
++            done = fold_orc(&ctx, op);
++            break;
+         case INDEX_op_qemu_ld_i32:
+         case INDEX_op_qemu_ld_i64:
+             done = fold_qemu_ld(&ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_qemu_st_i64:
+             done = fold_qemu_st(&ctx, op);
+             break;
+-
+-        default:
++        CASE_OP_32_64(rem):
++        CASE_OP_32_64(remu):
++            done = fold_remainder(&ctx, op);
++            break;
++        CASE_OP_32_64(rotl):
++        CASE_OP_32_64(rotr):
++        CASE_OP_32_64(sar):
++        CASE_OP_32_64(shl):
++        CASE_OP_32_64(shr):
++            done = fold_shift(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(sub):
++            done = fold_sub(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(xor):
++            done = fold_xor(&ctx, op);
+             break;
          }
-@@ -XXX,XX +XXX,XX @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
-     /* Otherwise implement out of line.  */
-     t_ptr = tcg_temp_new_ptr();
-     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
--    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
-+    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
-     if (vece == MO_64) {
-         if (in_64) {
-             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
-         } else {
--            t_64 = tcg_const_i64(in_c);
-+            t_64 = tcg_constant_i64(in_c);
-             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
--            tcg_temp_free_i64(t_64);
-         }
-     } else {
-         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
-@@ -XXX,XX +XXX,XX @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
-         if (in_32) {
-             fns[vece](t_ptr, t_desc, in_32);
--        } else {
-+        } else if (in_64) {
-             t_32 = tcg_temp_new_i32();
--            if (in_64) {
--                tcg_gen_extrl_i64_i32(t_32, in_64);
--            } else if (vece == MO_8) {
--                tcg_gen_movi_i32(t_32, in_c & 0xff);
--            } else if (vece == MO_16) {
--                tcg_gen_movi_i32(t_32, in_c & 0xffff);
--            } else {
--                tcg_gen_movi_i32(t_32, in_c);
--            }
-+            tcg_gen_extrl_i64_i32(t_32, in_64);
-             fns[vece](t_ptr, t_desc, t_32);
-             tcg_temp_free_i32(t_32);
-+        } else {
-+            if (vece == MO_8) {
-+                in_c &= 0xff;
-+            } else if (vece == MO_16) {
-+                in_c &= 0xffff;
-+            }
-+            t_32 = tcg_constant_i32(in_c);
-+            fns[vece](t_ptr, t_desc, t_32);
-         }
-     }
-     tcg_temp_free_ptr(t_ptr);
--    tcg_temp_free_i32(t_desc);
-     return;
-  done:
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
-             if (g->fno) {
-                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
-             } else {
--                TCGv_i64 tcg_c = tcg_const_i64(c);
-+                TCGv_i64 tcg_c = tcg_constant_i64(c);
-                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
-                                     maxsz, c, g->fnoi);
--                tcg_temp_free_i64(tcg_c);
-             }
-             oprsz = maxsz;
-         }
-@@ -XXX,XX +XXX,XX @@ static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
- void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
- {
--    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
-+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
-     gen_addv_mask(d, a, b, m);
--    tcg_temp_free_i64(m);
- }
- void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
- {
--    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
-+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
-     gen_addv_mask(d, a, b, m);
--    tcg_temp_free_i64(m);
- }
- void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
- void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        int64_t c, uint32_t oprsz, uint32_t maxsz)
- {
--    TCGv_i64 tmp = tcg_const_i64(c);
-+    TCGv_i64 tmp = tcg_constant_i64(c);
-     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
--    tcg_temp_free_i64(tmp);
- }
- static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
-@@ -XXX,XX +XXX,XX @@ static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
- void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
- {
--    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
-+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
-     gen_subv_mask(d, a, b, m);
--    tcg_temp_free_i64(m);
- }
- void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
- {
--    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
-+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
-     gen_subv_mask(d, a, b, m);
--    tcg_temp_free_i64(m);
- }
- void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
- void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        int64_t c, uint32_t oprsz, uint32_t maxsz)
- {
--    TCGv_i64 tmp = tcg_const_i64(c);
-+    TCGv_i64 tmp = tcg_constant_i64(c);
-     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
--    tcg_temp_free_i64(tmp);
- }
- void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
- static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
- {
--    TCGv_i32 max = tcg_const_i32(-1);
-+    TCGv_i32 max = tcg_constant_i32(-1);
-     tcg_gen_add_i32(d, a, b);
-     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
--    tcg_temp_free_i32(max);
- }
- static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
- {
--    TCGv_i64 max = tcg_const_i64(-1);
-+    TCGv_i64 max = tcg_constant_i64(-1);
-     tcg_gen_add_i64(d, a, b);
-     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
--    tcg_temp_free_i64(max);
- }
- void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
- static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
- {
--    TCGv_i32 min = tcg_const_i32(0);
-+    TCGv_i32 min = tcg_constant_i32(0);
-     tcg_gen_sub_i32(d, a, b);
-     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
--    tcg_temp_free_i32(min);
- }
- static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
- {
--    TCGv_i64 min = tcg_const_i64(0);
-+    TCGv_i64 min = tcg_constant_i64(0);
-     tcg_gen_sub_i64(d, a, b);
-     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
--    tcg_temp_free_i64(min);
- }
- void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
-@@ -XXX,XX +XXX,XX @@ static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
- void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
- {
--    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
-+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
-     gen_negv_mask(d, b, m);
--    tcg_temp_free_i64(m);
- }
- void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
- {
--    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
-+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
-     gen_negv_mask(d, b, m);
--    tcg_temp_free_i64(m);
- }
- void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
- void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        int64_t c, uint32_t oprsz, uint32_t maxsz)
- {
--    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
-+    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
-     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
--    tcg_temp_free_i64(tmp);
- }
- static const GVecGen2s gop_xors = {
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
- void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
-                        int64_t c, uint32_t oprsz, uint32_t maxsz)
- {
--    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
-+    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
-     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
--    tcg_temp_free_i64(tmp);
- }
- static const GVecGen2s gop_ors = {
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
- void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
-                       int64_t c, uint32_t oprsz, uint32_t maxsz)
- {
--    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
-+    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
-     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
--    tcg_temp_free_i64(tmp);
- }
- void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
-@@ -XXX,XX +XXX,XX @@ static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
-                                  TCGv_vec a, TCGv_vec b)
- {
-     TCGv_vec t = tcg_temp_new_vec_matching(d);
-+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
--    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
--    tcg_gen_and_vec(vece, t, t, b);
-+    tcg_gen_and_vec(vece, t, b, m);
-     tcg_gen_shlv_vec(vece, d, a, t);
-     tcg_temp_free_vec(t);
- }
-@@ -XXX,XX +XXX,XX @@ static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
-                                  TCGv_vec a, TCGv_vec b)
- {
-     TCGv_vec t = tcg_temp_new_vec_matching(d);
-+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
--    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
--    tcg_gen_and_vec(vece, t, t, b);
-+    tcg_gen_and_vec(vece, t, b, m);
-     tcg_gen_shrv_vec(vece, d, a, t);
-     tcg_temp_free_vec(t);
- }
-@@ -XXX,XX +XXX,XX @@ static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
-                                  TCGv_vec a, TCGv_vec b)
- {
-     TCGv_vec t = tcg_temp_new_vec_matching(d);
-+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
--    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
--    tcg_gen_and_vec(vece, t, t, b);
-+    tcg_gen_and_vec(vece, t, b, m);
-     tcg_gen_sarv_vec(vece, d, a, t);
-     tcg_temp_free_vec(t);
- }
-@@ -XXX,XX +XXX,XX @@ static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
-                                   TCGv_vec a, TCGv_vec b)
- {
-     TCGv_vec t = tcg_temp_new_vec_matching(d);
-+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
--    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
--    tcg_gen_and_vec(vece, t, t, b);
-+    tcg_gen_and_vec(vece, t, b, m);
-     tcg_gen_rotlv_vec(vece, d, a, t);
-     tcg_temp_free_vec(t);
- }
-@@ -XXX,XX +XXX,XX @@ static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
-                                   TCGv_vec a, TCGv_vec b)
- {
-     TCGv_vec t = tcg_temp_new_vec_matching(d);
-+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
--    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
--    tcg_gen_and_vec(vece, t, t, b);
-+    tcg_gen_and_vec(vece, t, b, m);
-     tcg_gen_rotrv_vec(vece, d, a, t);
-     tcg_temp_free_vec(t);
- }
-diff --git a/tcg/tcg.c b/tcg/tcg.c
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
-+++ b/tcg/tcg.c
-@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val)
-     return temp_tcgv_vec(tcg_constant_internal(type, val));
- }
-+TCGv_vec tcg_constant_vec_matching(TCGv_vec match, unsigned vece, int64_t val)
-+{
-+    TCGTemp *t = tcgv_vec_temp(match);
-+
-+    tcg_debug_assert(t->temp_allocated != 0);
-+    return tcg_constant_vec(t->base_type, vece, val);
-+}
-+
- TCGv_i32 tcg_const_i32(int32_t val)
- {
-     TCGv_i32 t0;
 --
 .25.1

-New patch
+[PULL 21/56] tcg/optimize: Split out fold_setcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
+file changed, 72 insertions(+), 73 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_setcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
++            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
++            goto do_setcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            goto do_setcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
++                                     op->args[4], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            op->args[2] = op->args[3];
++            op->args[3] = cond;
++            op->opc = INDEX_op_setcond_i32;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_setcond_high:
++        op->args[1] = op->args[2];
++        op->args[2] = op->args[4];
++        op->args[3] = cond;
++        op->opc = INDEX_op_setcond_i32;
++        break;
++    }
++    return false;
++
++ do_setcond_const:
++    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++}
++
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_setcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+-                                          op->args[5]);
+-            if (i >= 0) {
+-            do_setcond_const:
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0
+-                 && arg_is_const(op->args[4])
+-                 && arg_info(op->args[4])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_setcond_high:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_const;
+-                } else if (i > 0) {
+-                    goto do_setcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_setcond_low:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[2] = op->args[3];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_low;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        case INDEX_op_setcond2_i32:
++            done = fold_setcond2(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 22/56] tcg/optimize: Split out fold_brcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
+file changed, 81 insertions(+), 78 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[4];
++    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
++    TCGArg label = op->args[5];
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_brcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
++            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
++            goto do_brcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
++                                     op->args[2], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            goto do_brcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            op->opc = INDEX_op_brcond_i32;
++            op->args[1] = op->args[2];
++            op->args[2] = cond;
++            op->args[3] = label;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_brcond_high:
++        op->opc = INDEX_op_brcond_i32;
++        op->args[0] = op->args[1];
++        op->args[1] = op->args[3];
++        op->args[2] = cond;
++        op->args[3] = label;
++        break;
++
++    do_brcond_const:
++        if (i == 0) {
++            tcg_op_remove(ctx->tcg, op);
++            return true;
++        }
++        op->opc = INDEX_op_br;
++        op->args[0] = label;
++        break;
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_brcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+-                                          op->args[4]);
+-            if (i == 0) {
+-            do_brcond_false:
+-                tcg_op_remove(s, op);
+-                continue;
+-            }
+-            if (i > 0) {
+-            do_brcond_true:
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[5];
+-                break;
+-            }
+-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+-                 && arg_is_const(op->args[2])
+-                 && arg_info(op->args[2])->val == 0
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_brcond_high:
+-                op->opc = opc = INDEX_op_brcond_i32;
+-                op->args[0] = op->args[1];
+-                op->args[1] = op->args[3];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i > 0) {
+-                    goto do_brcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_brcond_low:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_high;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_low;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        case INDEX_op_brcond2_i32:
++            done = fold_brcond2(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 23/56] tcg/optimize: Split out fold_brcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 33 +++++++++++++++++++--------------
+file changed, 19 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[2];
++    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
++
++    if (i == 0) {
++        tcg_op_remove(ctx->tcg, op);
++        return true;
++    }
++    if (i > 0) {
++        op->opc = INDEX_op_br;
++        op->args[0] = op->args[3];
++    }
++    return false;
++}
++
+ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(brcond):
+-            i = do_constant_folding_cond(opc, op->args[0],
+-                                         op->args[1], op->args[2]);
+-            if (i == 0) {
+-                tcg_op_remove(s, op);
+-                continue;
+-            } else if (i > 0) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[3];
+-                break;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        CASE_OP_32_64(brcond):
++            done = fold_brcond(&ctx, op);
++            break;
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 24/56] tcg/optimize: Split out fold_setcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 23 ++++++++++++++---------
+file changed, 14 insertions(+), 9 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[3];
++    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[5];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(setcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[3]);
+-            if (i >= 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        CASE_OP_32_64(setcond):
++            done = fold_setcond(&ctx, op);
++            break;
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 25/56] tcg/optimize: Split out fold_mulu2_i32
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 37 +++++++++++++++++++++----------------
+file changed, 21 insertions(+), 16 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
++        uint32_t a = arg_info(op->args[2])->val;
++        uint32_t b = arg_info(op->args[3])->val;
++        uint64_t r = (uint64_t)a * b;
++        TCGArg rl, rh;
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++
++        rl = op->args[0];
++        rh = op->args[1];
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
++        return true;
++    }
++    return false;
++}
++
+ static bool fold_nand(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_mulu2_i32:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+-                uint32_t a = arg_info(op->args[2])->val;
+-                uint32_t b = arg_info(op->args[3])->val;
+-                uint64_t r = (uint64_t)a * b;
+-                TCGArg rl, rh;
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
+-
+-                rl = op->args[0];
+-                rh = op->args[1];
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(muluh):
+             done = fold_mul_highpart(&ctx, op);
+             break;
++        case INDEX_op_mulu2_i32:
++            done = fold_mulu2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64(nand):
+             done = fold_nand(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 26/56] tcg/optimize: Split out fold_addsub2_i32
+Add two additional helpers, fold_add2_i32 and fold_sub2_i32
+which will not be simple wrappers forever.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
+file changed, 44 insertions(+), 26 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
++        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
++        uint32_t al = arg_info(op->args[2])->val;
++        uint32_t ah = arg_info(op->args[3])->val;
++        uint32_t bl = arg_info(op->args[4])->val;
++        uint32_t bh = arg_info(op->args[5])->val;
++        uint64_t a = ((uint64_t)ah << 32) | al;
++        uint64_t b = ((uint64_t)bh << 32) | bl;
++        TCGArg rl, rh;
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++
++        if (add) {
++            a += b;
++        } else {
++            a -= b;
++        }
++
++        rl = op->args[0];
++        rh = op->args[1];
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
++        return true;
++    }
++    return false;
++}
++
++static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
++{
++    return fold_addsub2_i32(ctx, op, true);
++}
++
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
++{
++    return fold_addsub2_i32(ctx, op, false);
++}
++
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_add2_i32:
+-        case INDEX_op_sub2_i32:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
+-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+-                uint32_t al = arg_info(op->args[2])->val;
+-                uint32_t ah = arg_info(op->args[3])->val;
+-                uint32_t bl = arg_info(op->args[4])->val;
+-                uint32_t bh = arg_info(op->args[5])->val;
+-                uint64_t a = ((uint64_t)ah << 32) | al;
+-                uint64_t b = ((uint64_t)bh << 32) | bl;
+-                TCGArg rl, rh;
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
+-
+-                if (opc == INDEX_op_add2_i32) {
+-                    a += b;
+-                } else {
+-                    a -= b;
+-                }
+-
+-                rl = op->args[0];
+-                rh = op->args[1];
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
+-                continue;
+-            }
+-            break;
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(add):
+             done = fold_add(&ctx, op);
+             break;
++        case INDEX_op_add2_i32:
++            done = fold_add2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(and):
+             done = fold_and(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
++        case INDEX_op_sub2_i32:
++            done = fold_sub2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 27/56] tcg/optimize: Split out fold_movcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
+file changed, 31 insertions(+), 25 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_movcond(OptContext *ctx, TCGOp *op)
++{
++    TCGOpcode opc = op->opc;
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
++    }
++
++    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
++        uint64_t tv = arg_info(op->args[3])->val;
++        uint64_t fv = arg_info(op->args[4])->val;
++
++        opc = (opc == INDEX_op_movcond_i32
++               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
++
++        if (tv == 1 && fv == 0) {
++            op->opc = opc;
++            op->args[3] = cond;
++        } else if (fv == 1 && tv == 0) {
++            op->opc = opc;
++            op->args[3] = tcg_invert_cond(cond);
++        }
++    }
++    return false;
++}
++
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(movcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[5]);
+-            if (i >= 0) {
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
+-                continue;
+-            }
+-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+-                uint64_t tv = arg_info(op->args[3])->val;
+-                uint64_t fv = arg_info(op->args[4])->val;
+-                TCGCond cond = op->args[5];
+-
+-                if (fv == 1 && tv == 0) {
+-                    cond = tcg_invert_cond(cond);
+-                } else if (!(tv == 1 && fv == 0)) {
+-                    break;
+-                }
+-                op->args[3] = cond;
+-                op->opc = opc = (opc == INDEX_op_movcond_i32
+-                                 ? INDEX_op_setcond_i32
+-                                 : INDEX_op_setcond_i64);
+-            }
+-            break;
+-
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_mb:
+             done = fold_mb(&ctx, op);
+             break;
++        CASE_OP_32_64(movcond):
++            done = fold_movcond(&ctx, op);
++            break;
+         CASE_OP_32_64(mul):
+             done = fold_mul(&ctx, op);
+             break;
+--
+.25.1

-[PATCH 33/43] tcg/aarch64: Use tcg_constant_vec with tcg vec expanders
+[PULL 28/56] tcg/optimize: Split out fold_extract2
-Improve rotrv_vec to reduce "t1 = -v2, t2 = t1 + c" to
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-"t1 = -v, t2 = c - v2".  This avoids a serial dependency
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 between t1 and t2.
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/aarch64/tcg-target.c.inc | 10 +++++-----
+ tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
-file changed, 5 insertions(+), 5 deletions(-)
+file changed, 22 insertions(+), 17 deletions(-)
-diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/aarch64/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/aarch64/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
-                        TCGArg a0, ...)
+     return fold_const2(ctx, op);
  }
 +static bool fold_extract2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t v1 = arg_info(op->args[1])->val;
 +        uint64_t v2 = arg_info(op->args[2])->val;
 +        int shr = op->args[3];
 +
 +        if (op->opc == INDEX_op_extract2_i64) {
 +            v1 >>= shr;
 +            v2 <<= 64 - shr;
 +        } else {
 +            v1 = (uint32_t)v1 >> shr;
 +            v2 = (int32_t)v2 << (32 - shr);
 +        }
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
 +    }
 +    return false;
 +}
 +
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
-     va_list va;
+     return fold_const1(ctx, op);
--    TCGv_vec v0, v1, v2, t1, t2;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+    TCGv_vec v0, v1, v2, t1, t2, c1;
+             }
-     TCGArg a2;
+             break;
-     va_start(va, a0);
+-        CASE_OP_32_64(extract2):
-@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                uint64_t v1 = arg_info(op->args[1])->val;
-     case INDEX_op_rotlv_vec:
+-                uint64_t v2 = arg_info(op->args[2])->val;
-         t1 = tcg_temp_new_vec(type);
+-                int shr = op->args[3];
--        tcg_gen_dupi_vec(vece, t1, 8 << vece);
+-
--        tcg_gen_sub_vec(vece, t1, v2, t1);
+-                if (opc == INDEX_op_extract2_i64) {
-+        c1 = tcg_constant_vec(type, vece, 8 << vece);
+-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
-+        tcg_gen_sub_vec(vece, t1, v2, c1);
+-                } else {
-         /* Right shifts are negative left shifts for AArch64.  */
+-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
-         vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
+-                                    ((uint32_t)v2 << (32 - shr)));
-                   tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+-                }
-@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-     case INDEX_op_rotrv_vec:
+-                continue;
-         t1 = tcg_temp_new_vec(type);
+-            }
-         t2 = tcg_temp_new_vec(type);
+-            break;
-+        c1 = tcg_constant_vec(type, vece, 8 << vece);
+-
-         tcg_gen_neg_vec(vece, t1, v2);
+         default:
--        tcg_gen_dupi_vec(vece, t2, 8 << vece);
+             break;
--        tcg_gen_add_vec(vece, t2, t1, t2);
-+        tcg_gen_sub_vec(vece, t2, c1, v2);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-         /* Right shifts are negative left shifts for AArch64.  */
+         CASE_OP_32_64(eqv):
-         vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
+             done = fold_eqv(&ctx, op);
-                   tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+             break;
 +        CASE_OP_32_64(extract2):
 +            done = fold_extract2(&ctx, op);
 +            break;
          CASE_OP_32_64(ext8s):
          CASE_OP_32_64(ext16s):
          case INDEX_op_ext32s_i64:
 --
 .25.1

-[PATCH 39/43] tcg/ppc: Convert to tcg-constr.c.inc
+[PULL 29/56] tcg/optimize: Split out fold_extract, fold_sextract
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/ppc/tcg-target-constr.h |  37 ++++++++++
+ tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
- tcg/ppc/tcg-target.c.inc    | 135 +++++++++++++++---------------------
+file changed, 30 insertions(+), 18 deletions(-)
 files changed, 94 insertions(+), 78 deletions(-)
  create mode 100644 tcg/ppc/tcg-target-constr.h
-diff --git a/tcg/ppc/tcg-target-constr.h b/tcg/ppc/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-new file mode 100644
+index XXXXXXX..XXXXXXX 100644
-index XXXXXXX..XXXXXXX
+--- a/tcg/optimize.c
---- /dev/null
++++ b/tcg/optimize.c
-+++ b/tcg/ppc/tcg-target-constr.h
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@
+     return fold_const2(ctx, op);
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
+ }
-+/*
-+ * PowerPC target-specific operand constaints.
++static bool fold_extract(OptContext *ctx, TCGOp *op)
-+ * Copyright (c) 2020 Linaro
++{
-+ */
++    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
-+C_O0_I1(r)
++        t = arg_info(op->args[1])->val;
-+C_O0_I2(r, r)
++        t = extract64(t, op->args[2], op->args[3]);
-+C_O0_I2(r, ri)
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
-+C_O0_I2(S, S)
++    }
-+C_O0_I2(v, r)
++    return false;
-+C_O0_I3(S, S, S)
++}
-+C_O0_I4(r, r, ri, ri)
++
-+C_O0_I4(S, S, S, S)
+ static bool fold_extract2(OptContext *ctx, TCGOp *op)
-+C_O1_I1(r, L)
+ {
-+C_O1_I1(r, r)
+     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-+C_O1_I1(v, r)
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
-+C_O1_I1(v, v)
+     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +C_O1_I1(v, vr)
 +C_O1_I2(r, 0, rZ)
 +C_O1_I2(r, L, L)
 +C_O1_I2(r, rI, ri)
 +C_O1_I2(r, rI, rT)
 +C_O1_I2(r, r, r)
 +C_O1_I2(r, r, ri)
 +C_O1_I2(r, r, rI)
 +C_O1_I2(r, r, rT)
 +C_O1_I2(r, r, rU)
 +C_O1_I2(r, r, rZW)
 +C_O1_I2(v, v, v)
 +C_O1_I3(v, v, v, v)
 +C_O1_I4(r, r, ri, rZ, rZ)
 +C_O1_I4(r, r, r, ri, ri)
 +C_O2_I1(L, L, L)
 +C_O2_I2(L, L, L, L)
 +C_O2_I4(r, r, rI, rZM, r, r)
 +C_O2_I4(r, r, r, r, rI, rZM)
 diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/ppc/tcg-target.c.inc
 +++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
      va_end(va);
  }
-+/* Define all constraint sets. */
++static bool fold_sextract(OptContext *ctx, TCGOp *op)
-+#include "../tcg-constr.c.inc"
++{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
- static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++        t = arg_info(op->args[1])->val;
 +        t = sextract64(t, op->args[2], op->args[3]);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
--    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
+     return fold_const2(ctx, op);
--    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
+             }
--    static const TCGTargetOpDef S_S = { .args_ct_str = { "S", "S" } };
+             break;
--    static const TCGTargetOpDef r_ri = { .args_ct_str = { "r", "ri" } };
--    static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
+-        CASE_OP_32_64(extract):
--    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
+-            if (arg_is_const(op->args[1])) {
--    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
+-                tmp = extract64(arg_info(op->args[1])->val,
--    static const TCGTargetOpDef S_S_S = { .args_ct_str = { "S", "S", "S" } };
+-                                op->args[2], op->args[3]);
--    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
--    static const TCGTargetOpDef r_r_rI = { .args_ct_str = { "r", "r", "rI" } };
+-                continue;
--    static const TCGTargetOpDef r_r_rT = { .args_ct_str = { "r", "r", "rT" } };
+-            }
--    static const TCGTargetOpDef r_r_rU = { .args_ct_str = { "r", "r", "rU" } };
+-            break;
 -    static const TCGTargetOpDef r_rI_ri
 -        = { .args_ct_str = { "r", "rI", "ri" } };
 -    static const TCGTargetOpDef r_rI_rT
 -        = { .args_ct_str = { "r", "rI", "rT" } };
 -    static const TCGTargetOpDef r_r_rZW
 -        = { .args_ct_str = { "r", "r", "rZW" } };
 -    static const TCGTargetOpDef L_L_L_L
 -        = { .args_ct_str = { "L", "L", "L", "L" } };
 -    static const TCGTargetOpDef S_S_S_S
 -        = { .args_ct_str = { "S", "S", "S", "S" } };
 -    static const TCGTargetOpDef movc
 -        = { .args_ct_str = { "r", "r", "ri", "rZ", "rZ" } };
 -    static const TCGTargetOpDef dep
 -        = { .args_ct_str = { "r", "0", "rZ" } };
 -    static const TCGTargetOpDef br2
 -        = { .args_ct_str = { "r", "r", "ri", "ri" } };
 -    static const TCGTargetOpDef setc2
 -        = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
 -    static const TCGTargetOpDef add2
 -        = { .args_ct_str = { "r", "r", "r", "r", "rI", "rZM" } };
 -    static const TCGTargetOpDef sub2
 -        = { .args_ct_str = { "r", "r", "rI", "rZM", "r", "r" } };
 -    static const TCGTargetOpDef v_r = { .args_ct_str = { "v", "r" } };
 -    static const TCGTargetOpDef v_vr = { .args_ct_str = { "v", "vr" } };
 -    static const TCGTargetOpDef v_v = { .args_ct_str = { "v", "v" } };
 -    static const TCGTargetOpDef v_v_v = { .args_ct_str = { "v", "v", "v" } };
 -    static const TCGTargetOpDef v_v_v_v
 -        = { .args_ct_str = { "v", "v", "v", "v" } };
 -
-     switch (op) {
+-        CASE_OP_32_64(sextract):
-     case INDEX_op_goto_ptr:
+-            if (arg_is_const(op->args[1])) {
--        return &r;
+-                tmp = sextract64(arg_info(op->args[1])->val,
-+        return C_O0_I1(r);
+-                                 op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-     case INDEX_op_ld8u_i32:
+-                continue;
-     case INDEX_op_ld8s_i32:
+-            }
-     case INDEX_op_ld16u_i32:
+-            break;
-     case INDEX_op_ld16s_i32:
+-
-     case INDEX_op_ld_i32:
+         default:
--    case INDEX_op_st8_i32:
+             break;
--    case INDEX_op_st16_i32:
--    case INDEX_op_st_i32:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     case INDEX_op_ctpop_i32:
+         CASE_OP_32_64(eqv):
-     case INDEX_op_neg_i32:
+             done = fold_eqv(&ctx, op);
-     case INDEX_op_not_i32:
+             break;
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++        CASE_OP_32_64(extract):
-     case INDEX_op_ld32u_i64:
++            done = fold_extract(&ctx, op);
-     case INDEX_op_ld32s_i64:
++            break;
-     case INDEX_op_ld_i64:
+         CASE_OP_32_64(extract2):
--    case INDEX_op_st8_i64:
+             done = fold_extract2(&ctx, op);
--    case INDEX_op_st16_i64:
+             break;
--    case INDEX_op_st32_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    case INDEX_op_st_i64:
+         case INDEX_op_setcond2_i32:
-     case INDEX_op_ctpop_i64:
+             done = fold_setcond2(&ctx, op);
-     case INDEX_op_neg_i64:
+             break;
-     case INDEX_op_not_i64:
++        CASE_OP_32_64(sextract):
-@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++            done = fold_sextract(&ctx, op);
-     case INDEX_op_bswap32_i64:
++            break;
-     case INDEX_op_bswap64_i64:
+         CASE_OP_32_64_VEC(sub):
-     case INDEX_op_extract_i64:
+             done = fold_sub(&ctx, op);
--        return &r_r;
+             break;
 +        return C_O1_I1(r, r);
 +
 +    case INDEX_op_st8_i32:
 +    case INDEX_op_st16_i32:
 +    case INDEX_op_st_i32:
 +    case INDEX_op_st8_i64:
 +    case INDEX_op_st16_i64:
 +    case INDEX_op_st32_i64:
 +    case INDEX_op_st_i64:
 +        return C_O0_I2(r, r);
      case INDEX_op_add_i32:
      case INDEX_op_and_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_rotl_i64:
      case INDEX_op_rotr_i64:
      case INDEX_op_setcond_i64:
 -        return &r_r_ri;
 +        return C_O1_I2(r, r, ri);
 +
      case INDEX_op_mul_i32:
      case INDEX_op_mul_i64:
 -        return &r_r_rI;
 +        return C_O1_I2(r, r, rI);
 +
      case INDEX_op_div_i32:
      case INDEX_op_divu_i32:
      case INDEX_op_nand_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_divu_i64:
      case INDEX_op_mulsh_i64:
      case INDEX_op_muluh_i64:
 -        return &r_r_r;
 +        return C_O1_I2(r, r, r);
 +
      case INDEX_op_sub_i32:
 -        return &r_rI_ri;
 +        return C_O1_I2(r, rI, ri);
      case INDEX_op_add_i64:
 -        return &r_r_rT;
 +        return C_O1_I2(r, r, rT);
      case INDEX_op_or_i64:
      case INDEX_op_xor_i64:
 -        return &r_r_rU;
 +        return C_O1_I2(r, r, rU);
      case INDEX_op_sub_i64:
 -        return &r_rI_rT;
 +        return C_O1_I2(r, rI, rT);
      case INDEX_op_clz_i32:
      case INDEX_op_ctz_i32:
      case INDEX_op_clz_i64:
      case INDEX_op_ctz_i64:
 -        return &r_r_rZW;
 +        return C_O1_I2(r, r, rZW);
      case INDEX_op_brcond_i32:
      case INDEX_op_brcond_i64:
 -        return &r_ri;
 +        return C_O0_I2(r, ri);
      case INDEX_op_movcond_i32:
      case INDEX_op_movcond_i64:
 -        return &movc;
 +        return C_O1_I4(r, r, ri, rZ, rZ);
      case INDEX_op_deposit_i32:
      case INDEX_op_deposit_i64:
 -        return &dep;
 +        return C_O1_I2(r, 0, rZ);
      case INDEX_op_brcond2_i32:
 -        return &br2;
 +        return C_O0_I4(r, r, ri, ri);
      case INDEX_op_setcond2_i32:
 -        return &setc2;
 +        return C_O1_I4(r, r, r, ri, ri);
      case INDEX_op_add2_i64:
      case INDEX_op_add2_i32:
 -        return &add2;
 +        return C_O2_I4(r, r, r, r, rI, rZM);
      case INDEX_op_sub2_i64:
      case INDEX_op_sub2_i32:
 -        return &sub2;
 +        return C_O2_I4(r, r, rI, rZM, r, r);
      case INDEX_op_qemu_ld_i32:
          return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
 -                ? &r_L : &r_L_L);
 +                ? C_O1_I1(r, L)
 +                : C_O1_I2(r, L, L));
 +
      case INDEX_op_qemu_st_i32:
          return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
 -                ? &S_S : &S_S_S);
 +                ? C_O0_I2(S, S)
 +                : C_O0_I3(S, S, S));
 +
      case INDEX_op_qemu_ld_i64:
 -        return (TCG_TARGET_REG_BITS == 64 ? &r_L
 -                : TARGET_LONG_BITS == 32 ? &L_L_L : &L_L_L_L);
 +        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
 +                : TARGET_LONG_BITS == 32 ? C_O2_I1(L, L, L)
 +                : C_O2_I2(L, L, L, L));
 +
      case INDEX_op_qemu_st_i64:
 -        return (TCG_TARGET_REG_BITS == 64 ? &S_S
 -                : TARGET_LONG_BITS == 32 ? &S_S_S : &S_S_S_S);
 +        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(S, S)
 +                : TARGET_LONG_BITS == 32 ? C_O0_I3(S, S, S)
 +                : C_O0_I4(S, S, S, S));
      case INDEX_op_add_vec:
      case INDEX_op_sub_vec:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_ppc_mulou_vec:
      case INDEX_op_ppc_pkum_vec:
      case INDEX_op_dup2_vec:
 -        return &v_v_v;
 +        return C_O1_I2(v, v, v);
 +
      case INDEX_op_not_vec:
      case INDEX_op_neg_vec:
 -        return &v_v;
 +        return C_O1_I1(v, v);
 +
      case INDEX_op_dup_vec:
 -        return have_isa_3_00 ? &v_vr : &v_v;
 +        return have_isa_3_00 ? C_O1_I1(v, vr) : C_O1_I1(v, v);
 +
      case INDEX_op_ld_vec:
 -    case INDEX_op_st_vec:
      case INDEX_op_dupm_vec:
 -        return &v_r;
 +        return C_O1_I1(v, r);
 +
 +    case INDEX_op_st_vec:
 +        return C_O0_I2(v, r);
 +
      case INDEX_op_bitsel_vec:
      case INDEX_op_ppc_msum_vec:
 -        return &v_v_v_v;
 +        return C_O1_I3(v, v, v, v);
      default:
          return NULL;
 --
 .25.1

-[PATCH 15/43] tcg: Expand TCGTemp.val to 64-bits
+[PULL 30/56] tcg/optimize: Split out fold_deposit
-This will reduce the differences between 32-bit and 64-bit hosts,
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-allowing full 64-bit constants to be created with the same interface.
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h | 2 +-
+ tcg/optimize.c | 25 +++++++++++++++----------
- tcg/tcg.c         | 2 +-
+file changed, 15 insertions(+), 10 deletions(-)
 files changed, 2 insertions(+), 2 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TCGTemp {
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-     unsigned int mem_allocated:1;
+     return fold_const1(ctx, op);
-     unsigned int temp_allocated:1;
+ }
--    tcg_target_long val;
++static bool fold_deposit(OptContext *ctx, TCGOp *op)
-+    int64_t val;
++{
-     struct TCGTemp *mem_base;
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-     intptr_t mem_offset;
++        uint64_t t1 = arg_info(op->args[1])->val;
-     const char *name;
++        uint64_t t2 = arg_info(op->args[2])->val;
-diff --git a/tcg/tcg.c b/tcg/tcg.c
++
-index XXXXXXX..XXXXXXX 100644
++        t1 = deposit64(t1, op->args[3], op->args[4], t2);
---- a/tcg/tcg.c
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
-+++ b/tcg/tcg.c
++    }
-@@ -XXX,XX +XXX,XX @@ static void dump_regs(TCGContext *s)
++    return false;
-                    tcg_target_reg_names[ts->mem_base->reg]);
++}
 +
  static bool fold_divide(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
-         case TEMP_VAL_CONST:
--            printf("$0x%" TCG_PRIlx, ts->val);
+-        CASE_OP_32_64(deposit):
-+            printf("$0x%" PRIx64, ts->val);
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tmp = deposit64(arg_info(op->args[1])->val,
 -                                op->args[3], op->args[4],
 -                                arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
-         case TEMP_VAL_DEAD:
-             printf("D");
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(ctpop):
              done = fold_ctpop(&ctx, op);
              break;
 +        CASE_OP_32_64(deposit):
 +            done = fold_deposit(&ctx, op);
 +            break;
          CASE_OP_32_64(div):
          CASE_OP_32_64(divu):
              done = fold_divide(&ctx, op);
 --
 .25.1

-[PATCH 10/43] tcg: Remove TCG_TARGET_HAS_cmp_vec
+[PULL 31/56] tcg/optimize: Split out fold_count_zeros
-The cmp_vec opcode is mandatory; this symbol is unused.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/aarch64/tcg-target.h | 1 -
+ tcg/optimize.c | 32 ++++++++++++++++++--------------
- tcg/i386/tcg-target.h    | 1 -
+file changed, 18 insertions(+), 14 deletions(-)
  tcg/ppc/tcg-target.h     | 1 -
 files changed, 3 deletions(-)
-diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/aarch64/tcg-target.h
+--- a/tcg/optimize.c
-+++ b/tcg/aarch64/tcg-target.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef enum {
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
- #define TCG_TARGET_HAS_shi_vec          1
+     return true;
- #define TCG_TARGET_HAS_shs_vec          0
+ }
- #define TCG_TARGET_HAS_shv_vec          1
--#define TCG_TARGET_HAS_cmp_vec          1
++static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
- #define TCG_TARGET_HAS_mul_vec          1
++{
- #define TCG_TARGET_HAS_sat_vec          1
++    if (arg_is_const(op->args[1])) {
- #define TCG_TARGET_HAS_minmax_vec       1
++        uint64_t t = arg_info(op->args[1])->val;
-diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
++
-index XXXXXXX..XXXXXXX 100644
++        if (t != 0) {
---- a/tcg/i386/tcg-target.h
++            t = do_constant_folding(op->opc, t, 0);
-+++ b/tcg/i386/tcg-target.h
++            return tcg_opt_gen_movi(ctx, op, op->args[0], t);
-@@ -XXX,XX +XXX,XX @@ extern bool have_avx2;
++        }
- #define TCG_TARGET_HAS_shi_vec          1
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
- #define TCG_TARGET_HAS_shs_vec          1
++    }
- #define TCG_TARGET_HAS_shv_vec          have_avx2
++    return false;
--#define TCG_TARGET_HAS_cmp_vec          1
++}
- #define TCG_TARGET_HAS_mul_vec          1
++
- #define TCG_TARGET_HAS_sat_vec          1
+ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
- #define TCG_TARGET_HAS_minmax_vec       1
+ {
-diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
+     return fold_const1(ctx, op);
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
---- a/tcg/ppc/tcg-target.h
+             }
-+++ b/tcg/ppc/tcg-target.h
+             break;
-@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
- #define TCG_TARGET_HAS_shi_vec          0
+-        CASE_OP_32_64(clz):
- #define TCG_TARGET_HAS_shs_vec          0
+-        CASE_OP_32_64(ctz):
- #define TCG_TARGET_HAS_shv_vec          1
+-            if (arg_is_const(op->args[1])) {
--#define TCG_TARGET_HAS_cmp_vec          1
+-                TCGArg v = arg_info(op->args[1])->val;
- #define TCG_TARGET_HAS_mul_vec          1
+-                if (v != 0) {
- #define TCG_TARGET_HAS_sat_vec          1
+-                    tmp = do_constant_folding(opc, v, 0);
- #define TCG_TARGET_HAS_minmax_vec       1
+-                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                } else {
 -                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
 -                }
 -                continue;
 -            }
 -            break;
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_brcond2_i32:
              done = fold_brcond2(&ctx, op);
              break;
 +        CASE_OP_32_64(clz):
 +        CASE_OP_32_64(ctz):
 +            done = fold_count_zeros(&ctx, op);
 +            break;
          CASE_OP_32_64(ctpop):
              done = fold_ctpop(&ctx, op);
              break;
 --
 .25.1

-[PATCH 09/43] tcg/optimize: Fold dup2_vec
+[PULL 32/56] tcg/optimize: Split out fold_bswap
-When the two arguments are identical, this can be reduced to
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-dup_vec or to mov_vec from a tcg_constant_vec.
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/optimize.c | 15 +++++++++++++++
+ tcg/optimize.c | 27 ++++++++++++++++-----------
-file changed, 15 insertions(+)
+file changed, 16 insertions(+), 11 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+     return false;
+ }
++static bool fold_bswap(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++
++        t = do_constant_folding(op->opc, t, op->args[2]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
-             goto do_default;
+             break;
-+        case INDEX_op_dup2_vec:
+-        CASE_OP_32_64(bswap16):
-+            assert(TCG_TARGET_REG_BITS == 32);
+-        CASE_OP_32_64(bswap32):
-+            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-        case INDEX_op_bswap64_i64:
-+                tmp = arg_info(op->args[1])->val;
+-            if (arg_is_const(op->args[1])) {
-+                if (tmp == arg_info(op->args[2])->val) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-+                    tcg_opt_gen_movi(s, op, op->args[0], tmp);
+-                                          op->args[2]);
-+                    break;
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-+                }
+-                continue;
-+            } else if (args_are_copies(op->args[1], op->args[2])) {
+-            }
-+                op->opc = INDEX_op_dup_vec;
+-            break;
-+                TCGOP_VECE(op) = MO_32;
+-
-+                nb_iargs = 1;
+         default:
-+            }
+             break;
-+            goto do_default;
-+
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-         CASE_OP_32_64(not):
+         case INDEX_op_brcond2_i32:
-         CASE_OP_32_64(neg):
+             done = fold_brcond2(&ctx, op);
-         CASE_OP_32_64(ext8s):
+             break;
 +        CASE_OP_32_64(bswap16):
 +        CASE_OP_32_64(bswap32):
 +        case INDEX_op_bswap64_i64:
 +            done = fold_bswap(&ctx, op);
 +            break;
          CASE_OP_32_64(clz):
          CASE_OP_32_64(ctz):
              done = fold_count_zeros(&ctx, op);
 --
 .25.1

-[PATCH 01/43] tcg: Adjust simd_desc size encoding
+[PULL 33/56] tcg/optimize: Split out fold_dup, fold_dup2
-With larger vector sizes, it turns out oprsz == maxsz, and we only
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 need to represent mismatch for oprsz <= 32.  We do, however, need
 to represent larger oprsz and do so without reducing SIMD_DATA_BITS.
 Reduce the size of the oprsz field and increase the maxsz field.
 Steal the oprsz value of 24 to indicate equality with maxsz.
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-gvec-desc.h | 38 ++++++++++++++++++++++++-------------
+ tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
- tcg/tcg-op-gvec.c           | 35 ++++++++++++++++++++++++++--------
+file changed, 31 insertions(+), 22 deletions(-)
 files changed, 52 insertions(+), 21 deletions(-)
-diff --git a/include/tcg/tcg-gvec-desc.h b/include/tcg/tcg-gvec-desc.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-gvec-desc.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg-gvec-desc.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
- #ifndef TCG_TCG_GVEC_DESC_H
+     return fold_const2(ctx, op);
- #define TCG_TCG_GVEC_DESC_H
+ }
--/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
++static bool fold_dup(OptContext *ctx, TCGOp *op)
--#define SIMD_OPRSZ_SHIFT   0
++{
--#define SIMD_OPRSZ_BITS    5
++    if (arg_is_const(op->args[1])) {
-+/*
++        uint64_t t = arg_info(op->args[1])->val;
-+ * This configuration allows MAXSZ to represent 2048 bytes, and
++        t = dup_const(TCGOP_VECE(op), t);
-+ * OPRSZ to match MAXSZ, or represent the smaller values 8, 16, or 32.
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
-+ *
++    }
-+ * Encode this with:
++    return false;
 + *   0, 1, 3 -> 8, 16, 32
 + *   2       -> maxsz
 + *
 + * This steals the input that would otherwise map to 24 to match maxsz.
 + */
 +#define SIMD_MAXSZ_SHIFT   0
 +#define SIMD_MAXSZ_BITS    8
 -#define SIMD_MAXSZ_SHIFT   (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
 -#define SIMD_MAXSZ_BITS    5
 +#define SIMD_OPRSZ_SHIFT   (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
 +#define SIMD_OPRSZ_BITS    2
 -#define SIMD_DATA_SHIFT    (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
 +#define SIMD_DATA_SHIFT    (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
  #define SIMD_DATA_BITS     (32 - SIMD_DATA_SHIFT)
  /* Create a descriptor from components.  */
  uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
 -/* Extract the operation size from a descriptor.  */
 -static inline intptr_t simd_oprsz(uint32_t desc)
 -{
 -    return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
 -}
 -
  /* Extract the max vector size from a descriptor.  */
  static inline intptr_t simd_maxsz(uint32_t desc)
  {
 -    return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
 +    return extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) * 8 + 8;
 +}
 +
-+/* Extract the operation size from a descriptor.  */
++static bool fold_dup2(OptContext *ctx, TCGOp *op)
 +static inline intptr_t simd_oprsz(uint32_t desc)
 +{
-+    uint32_t f = extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS);
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-+    intptr_t o = f * 8 + 8;
++        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
-+    intptr_t m = simd_maxsz(desc);
++                               arg_info(op->args[2])->val);
-+    return f == 2 ? m : o;
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
  }
  /* Extract the operation-specific data from a descriptor.  */
 diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op-gvec.c
 +++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ static const TCGOpcode vecop_list_empty[1] = { 0 };
     of the operand offsets so that we can check them all at once.  */
  static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  {
 -    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
 -    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
 -    tcg_debug_assert(oprsz > 0);
 -    tcg_debug_assert(oprsz <= maxsz);
 -    tcg_debug_assert((oprsz & opr_align) == 0);
 +    uint32_t max_align;
 +
 +    switch (oprsz) {
 +    case 8:
 +    case 16:
 +    case 32:
 +        tcg_debug_assert(oprsz <= maxsz);
 +        break;
 +    default:
 +        tcg_debug_assert(oprsz == maxsz);
 +        break;
 +    }
 +    tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
 +
 +    max_align = maxsz >= 16 ? 15 : 7;
      tcg_debug_assert((maxsz & max_align) == 0);
      tcg_debug_assert((ofs & max_align) == 0);
  }
@@ -XXX,XX +XXX,XX @@ uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  {
      uint32_t desc = 0;
 -    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
 -    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
 -    assert(data == sextract32(data, 0, SIMD_DATA_BITS));
 +    check_size_align(oprsz, maxsz, 0);
 +    tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
      oprsz = (oprsz / 8) - 1;
      maxsz = (maxsz / 8) - 1;
 +
 +    /*
 +     * We have just asserted in check_size_align that either
 +     * oprsz is {8,16,32} or matches maxsz.  Encode the final
 +     * case with '2', as that would otherwise map to 24.
 +     */
 +    if (oprsz == maxsz) {
 +        oprsz = 2;
 +    }
 +
-     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
++    if (args_are_copies(op->args[1], op->args[2])) {
-     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
++        op->opc = INDEX_op_dup_vec;
-     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
++        TCGOP_VECE(op) = MO_32;
 +    }
 +    return false;
 +}
 +
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
      return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              break;
 -        case INDEX_op_dup_vec:
 -            if (arg_is_const(op->args[1])) {
 -                tmp = arg_info(op->args[1])->val;
 -                tmp = dup_const(TCGOP_VECE(op), tmp);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
 -        case INDEX_op_dup2_vec:
 -            assert(TCG_TARGET_REG_BITS == 32);
 -            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0],
 -                                 deposit64(arg_info(op->args[1])->val, 32, 32,
 -                                           arg_info(op->args[2])->val));
 -                continue;
 -            } else if (args_are_copies(op->args[1], op->args[2])) {
 -                op->opc = INDEX_op_dup_vec;
 -                TCGOP_VECE(op) = MO_32;
 -            }
 -            break;
 -
          default:
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(divu):
              done = fold_divide(&ctx, op);
              break;
 +        case INDEX_op_dup_vec:
 +            done = fold_dup(&ctx, op);
 +            break;
 +        case INDEX_op_dup2_vec:
 +            done = fold_dup2(&ctx, op);
 +            break;
          CASE_OP_32_64(eqv):
              done = fold_eqv(&ctx, op);
              break;
 --
 .25.1

-[PATCH 06/43] tcg: Remove TCGOpDef.used
+[PULL 34/56] tcg/optimize: Split out fold_mov
-The last user of this field disappeared in f69d277ece4.
+This is the final entry in the main switch that was in a
 different form.  After this, we have the option to convert
 the switch into a function dispatch table.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h | 3 ---
+ tcg/optimize.c | 27 ++++++++++++++-------------
-file changed, 3 deletions(-)
+file changed, 14 insertions(+), 13 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef struct TCGOpDef {
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
-     uint8_t nb_oargs, nb_iargs, nb_cargs, nb_args;
+     return true;
-     uint8_t flags;
+ }
-     TCGArgConstraint *args_ct;
--#if defined(CONFIG_DEBUG_TCG)
++static bool fold_mov(OptContext *ctx, TCGOp *op)
--    int used;
++{
--#endif
++    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
- } TCGOpDef;
++}
++
- extern TCGOpDef tcg_op_defs[];
+ static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Propagate constants through copy operations and do constant
 -           folding.  Constants will be substituted to arguments by register
 -           allocator where needed and possible.  Also detect copies. */
 +        /*
 +         * Process each opcode.
 +         * Sorted alphabetically by opcode as much as possible.
 +         */
          switch (opc) {
 -        CASE_OP_32_64_VEC(mov):
 -            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            break;
 -
 -        default:
 -            break;
 -
 -        /* ---------------------------------------------------------- */
 -        /* Sorted alphabetically by opcode as much as possible. */
 -
          CASE_OP_32_64_VEC(add):
              done = fold_add(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 +        CASE_OP_32_64_VEC(mov):
 +            done = fold_mov(&ctx, op);
 +            break;
          CASE_OP_32_64(movcond):
              done = fold_movcond(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(xor):
              done = fold_xor(&ctx, op);
              break;
 +        default:
 +            break;
          }
          if (!done) {
 --
 .25.1

-[PATCH 38/43] tcg/mips: Convert to tcg-constr.c.inc
+[PULL 35/56] tcg/optimize: Split out fold_xx_to_i
+Pull the "op r, a, a => movi r, 0" optimization into a function,
+and use it in the outer opcode fold functions.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/mips/tcg-target-constr.h | 31 ++++++++++++
+ tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
- tcg/mips/tcg-target.c.inc    | 95 ++++++++++++------------------------
+file changed, 24 insertions(+), 17 deletions(-)
 files changed, 61 insertions(+), 65 deletions(-)
  create mode 100644 tcg/mips/tcg-target-constr.h
-diff --git a/tcg/mips/tcg-target-constr.h b/tcg/mips/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-new file mode 100644
+index XXXXXXX..XXXXXXX 100644
-index XXXXXXX..XXXXXXX
+--- a/tcg/optimize.c
---- /dev/null
++++ b/tcg/optimize.c
-+++ b/tcg/mips/tcg-target-constr.h
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@
+     return false;
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
+ }
-+/*
-+ * MIPS target-specific operand constaints.
++/* If the binary operation has both arguments equal, fold to @i. */
-+ * Copyright (c) 2020 Linaro
++static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-+ */
++{
 +    if (args_are_copies(op->args[1], op->args[2])) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +    }
 +    return false;
 +}
 +
-+C_O0_I1(r)
+ /*
-+C_O0_I2(rZ, r)
+  * These outermost fold_<op> functions are sorted alphabetically.
-+C_O0_I2(rZ, rZ)
+  */
-+C_O0_I2(SZ, S)
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-+C_O0_I3(SZ, S, S)
-+C_O0_I3(SZ, SZ, S)
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
-+C_O0_I4(rZ, rZ, rZ, rZ)
+ {
-+C_O0_I4(SZ, SZ, S, S)
+-    return fold_const2(ctx, op);
-+C_O1_I1(r, L)
++    if (fold_const2(ctx, op) ||
-+C_O1_I1(r, r)
++        fold_xx_to_i(ctx, op, 0)) {
-+C_O1_I2(r, 0, rZ)
++        return true;
-+C_O1_I2(r, L, L)
++    }
-+C_O1_I2(r, r, ri)
++    return false;
 +C_O1_I2(r, r, rI)
 +C_O1_I2(r, r, rIK)
 +C_O1_I2(r, r, rJ)
 +C_O1_I2(r, r, rWZ)
 +C_O1_I2(r, rZ, rN)
 +C_O1_I2(r, rZ, rZ)
 +C_O1_I4(r, rZ, rZ, rZ, 0)
 +C_O1_I4(r, rZ, rZ, rZ, rZ)
 +C_O2_I1(r, r, L)
 +C_O2_I2(r, r, L, L)
 +C_O2_I2(r, r, r, r)
 +C_O2_I4(r, r, rZ, rZ, rN, rN)
 diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.c.inc
 +++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
      }
  }
-+/* Define all constraint sets. */
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
-+#include "../tcg-constr.c.inc"
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-+
- static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
--    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
+-    return fold_const2(ctx, op);
--    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
++    if (fold_const2(ctx, op) ||
--    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
++        fold_xx_to_i(ctx, op, 0)) {
--    static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
++        return true;
--    static const TCGTargetOpDef SZ_S = { .args_ct_str = { "SZ", "S" } };
++    }
--    static const TCGTargetOpDef rZ_rZ = { .args_ct_str = { "rZ", "rZ" } };
++    return false;
--    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
+ }
--    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
--    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
+ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef r_r_rI = { .args_ct_str = { "r", "r", "rI" } };
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef r_r_rJ = { .args_ct_str = { "r", "r", "rJ" } };
--    static const TCGTargetOpDef SZ_S_S = { .args_ct_str = { "SZ", "S", "S" } };
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef SZ_SZ_S
+ {
--        = { .args_ct_str = { "SZ", "SZ", "S" } };
+-    return fold_const2(ctx, op);
--    static const TCGTargetOpDef SZ_SZ_S_S
++    if (fold_const2(ctx, op) ||
--        = { .args_ct_str = { "SZ", "SZ", "S", "S" } };
++        fold_xx_to_i(ctx, op, 0)) {
--    static const TCGTargetOpDef r_rZ_rN
++        return true;
--        = { .args_ct_str = { "r", "rZ", "rN" } };
++    }
--    static const TCGTargetOpDef r_rZ_rZ
++    return false;
--        = { .args_ct_str = { "r", "rZ", "rZ" } };
+ }
--    static const TCGTargetOpDef r_r_rIK
--        = { .args_ct_str = { "r", "r", "rIK" } };
+ /* Propagate constants and copies, fold constant expressions. */
--    static const TCGTargetOpDef r_r_rWZ
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--        = { .args_ct_str = { "r", "r", "rWZ" } };
+             break;
--    static const TCGTargetOpDef r_r_r_r
+         }
--        = { .args_ct_str = { "r", "r", "r", "r" } };
--    static const TCGTargetOpDef r_r_L_L
+-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
--        = { .args_ct_str = { "r", "r", "L", "L" } };
+-        switch (opc) {
--    static const TCGTargetOpDef dep
+-        CASE_OP_32_64_VEC(andc):
--        = { .args_ct_str = { "r", "0", "rZ" } };
+-        CASE_OP_32_64_VEC(sub):
--    static const TCGTargetOpDef movc
+-        CASE_OP_32_64_VEC(xor):
--        = { .args_ct_str = { "r", "rZ", "rZ", "rZ", "0" } };
+-            if (args_are_copies(op->args[1], op->args[2])) {
--    static const TCGTargetOpDef movc_r6
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
--        = { .args_ct_str = { "r", "rZ", "rZ", "rZ", "rZ" } };
+-                continue;
--    static const TCGTargetOpDef add2
+-            }
--        = { .args_ct_str = { "r", "r", "rZ", "rZ", "rN", "rN" } };
+-            break;
--    static const TCGTargetOpDef br2
+-        default:
--        = { .args_ct_str = { "rZ", "rZ", "rZ", "rZ" } };
+-            break;
--    static const TCGTargetOpDef setc2
+-        }
 -        = { .args_ct_str = { "r", "rZ", "rZ", "rZ", "rZ" } };
 -
-     switch (op) {
+         /*
-     case INDEX_op_goto_ptr:
+          * Process each opcode.
--        return &r;
+          * Sorted alphabetically by opcode as much as possible.
 +        return C_O0_I1(r);
      case INDEX_op_ld8u_i32:
      case INDEX_op_ld8s_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_extrl_i64_i32:
      case INDEX_op_extrh_i64_i32:
      case INDEX_op_extract_i64:
 -        return &r_r;
 +        return C_O1_I1(r, r);
      case INDEX_op_st8_i32:
      case INDEX_op_st16_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_st16_i64:
      case INDEX_op_st32_i64:
      case INDEX_op_st_i64:
 -        return &rZ_r;
 +        return C_O0_I2(rZ, r);
      case INDEX_op_add_i32:
      case INDEX_op_add_i64:
 -        return &r_r_rJ;
 +        return C_O1_I2(r, r, rJ);
      case INDEX_op_sub_i32:
      case INDEX_op_sub_i64:
 -        return &r_rZ_rN;
 +        return C_O1_I2(r, rZ, rN);
      case INDEX_op_mul_i32:
      case INDEX_op_mulsh_i32:
      case INDEX_op_muluh_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_remu_i64:
      case INDEX_op_nor_i64:
      case INDEX_op_setcond_i64:
 -        return &r_rZ_rZ;
 +        return C_O1_I2(r, rZ, rZ);
      case INDEX_op_muls2_i32:
      case INDEX_op_mulu2_i32:
      case INDEX_op_muls2_i64:
      case INDEX_op_mulu2_i64:
 -        return &r_r_r_r;
 +        return C_O2_I2(r, r, r, r);
      case INDEX_op_and_i32:
      case INDEX_op_and_i64:
 -        return &r_r_rIK;
 +        return C_O1_I2(r, r, rIK);
      case INDEX_op_or_i32:
      case INDEX_op_xor_i32:
      case INDEX_op_or_i64:
      case INDEX_op_xor_i64:
 -        return &r_r_rI;
 +        return C_O1_I2(r, r, rI);
      case INDEX_op_shl_i32:
      case INDEX_op_shr_i32:
      case INDEX_op_sar_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_sar_i64:
      case INDEX_op_rotr_i64:
      case INDEX_op_rotl_i64:
 -        return &r_r_ri;
 +        return C_O1_I2(r, r, ri);
      case INDEX_op_clz_i32:
      case INDEX_op_clz_i64:
 -        return &r_r_rWZ;
 +        return C_O1_I2(r, r, rWZ);
      case INDEX_op_deposit_i32:
      case INDEX_op_deposit_i64:
 -        return &dep;
 +        return C_O1_I2(r, 0, rZ);
      case INDEX_op_brcond_i32:
      case INDEX_op_brcond_i64:
 -        return &rZ_rZ;
 +        return C_O0_I2(rZ, rZ);
      case INDEX_op_movcond_i32:
      case INDEX_op_movcond_i64:
 -        return use_mips32r6_instructions ? &movc_r6 : &movc;
 -
 +        return (use_mips32r6_instructions
 +                ? C_O1_I4(r, rZ, rZ, rZ, rZ)
 +                : C_O1_I4(r, rZ, rZ, rZ, 0));
      case INDEX_op_add2_i32:
      case INDEX_op_sub2_i32:
 -        return &add2;
 +        return C_O2_I4(r, r, rZ, rZ, rN, rN);
      case INDEX_op_setcond2_i32:
 -        return &setc2;
 +        return C_O1_I4(r, rZ, rZ, rZ, rZ);
      case INDEX_op_brcond2_i32:
 -        return &br2;
 +        return C_O0_I4(rZ, rZ, rZ, rZ);
      case INDEX_op_qemu_ld_i32:
          return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
 -                ? &r_L : &r_L_L);
 +                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
      case INDEX_op_qemu_st_i32:
          return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
 -                ? &SZ_S : &SZ_S_S);
 +                ? C_O0_I2(SZ, S) : C_O0_I3(SZ, S, S));
      case INDEX_op_qemu_ld_i64:
 -        return (TCG_TARGET_REG_BITS == 64 ? &r_L
 -                : TARGET_LONG_BITS == 32 ? &r_r_L : &r_r_L_L);
 +        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
 +                : TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, L)
 +                : C_O2_I2(r, r, L, L));
      case INDEX_op_qemu_st_i64:
 -        return (TCG_TARGET_REG_BITS == 64 ? &SZ_S
 -                : TARGET_LONG_BITS == 32 ? &SZ_SZ_S : &SZ_SZ_S_S);
 +        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(SZ, S)
 +                : TARGET_LONG_BITS == 32 ? C_O0_I3(SZ, SZ, S)
 +                : C_O0_I4(SZ, SZ, S, S));
      default:
          return NULL;
 --
 .25.1

-[PATCH 37/43] tcg/arm: Convert to tcg-constr.c.inc
+[PULL 36/56] tcg/optimize: Split out fold_xx_to_x
+Pull the "op r, a, a => mov r, a" optimization into a function,
+and use it in the outer opcode fold functions.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/arm/tcg-target-constr.h | 30 ++++++++++++
+ tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
- tcg/arm/tcg-target.c.inc    | 93 +++++++++++++------------------------
+file changed, 24 insertions(+), 15 deletions(-)
 files changed, 63 insertions(+), 60 deletions(-)
  create mode 100644 tcg/arm/tcg-target-constr.h
-diff --git a/tcg/arm/tcg-target-constr.h b/tcg/arm/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-new file mode 100644
+index XXXXXXX..XXXXXXX 100644
-index XXXXXXX..XXXXXXX
+--- a/tcg/optimize.c
---- /dev/null
++++ b/tcg/optimize.c
-+++ b/tcg/arm/tcg-target-constr.h
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-@@ -XXX,XX +XXX,XX @@
+     return false;
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
+ }
-+/*
-+ * ARM32 target-specific operand constraints.
++/* If the binary operation has both arguments equal, fold to identity. */
-+ * Copyright (c) 2020 Linaro
++static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
-+ */
++{
 +    if (args_are_copies(op->args[1], op->args[2])) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +    }
 +    return false;
 +}
 +
-+C_O0_I1(r)
+ /*
-+C_O0_I2(r, r)
+  * These outermost fold_<op> functions are sorted alphabetically.
-+C_O0_I2(r, rIN)
++ *
-+C_O0_I2(s, s)
++ * The ordering of the transformations should be:
-+C_O0_I3(s, s, s)
++ *   1) those that produce a constant
-+C_O0_I4(r, r, rI, rI)
++ *   2) those that produce a copy
-+C_O0_I4(s, s, s, s)
++ *   3) those that produce information about the result value.
-+C_O1_I1(r, l)
+  */
-+C_O1_I1(r, r)
-+C_O1_I2(r, 0, rZ)
+ static bool fold_add(OptContext *ctx, TCGOp *op)
-+C_O1_I2(r, l, l)
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
-+C_O1_I2(r, r, r)
-+C_O1_I2(r, r, rI)
+ static bool fold_and(OptContext *ctx, TCGOp *op)
-+C_O1_I2(r, r, rIK)
+ {
-+C_O1_I2(r, r, rIN)
+-    return fold_const2(ctx, op);
-+C_O1_I2(r, r, ri)
++    if (fold_const2(ctx, op) ||
-+C_O1_I2(r, rZ, rZ)
++        fold_xx_to_x(ctx, op)) {
-+C_O1_I4(r, r, r, rI, rI)
++        return true;
-+C_O1_I4(r, r, rIN, rIK, 0)
++    }
-+C_O2_I1(r, r, l)
++    return false;
 +C_O2_I2(r, r, l, l)
 +C_O2_I2(r, r, r, r)
 +C_O2_I4(r, r, r, r, rIN, rIK)
 +C_O2_I4(r, r, rI, rI, rIN, rIK)
 diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.c.inc
 +++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
      }
  }
-+/* Define all constraint sets. */
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
-+#include "../tcg-constr.c.inc"
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
-+
- static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
+ static bool fold_or(OptContext *ctx, TCGOp *op)
  {
--    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
+-    return fold_const2(ctx, op);
--    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
++    if (fold_const2(ctx, op) ||
--    static const TCGTargetOpDef s_s = { .args_ct_str = { "s", "s" } };
++        fold_xx_to_x(ctx, op)) {
--    static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
++        return true;
--    static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
++    }
--    static const TCGTargetOpDef r_r_l = { .args_ct_str = { "r", "r", "l" } };
++    return false;
--    static const TCGTargetOpDef r_l_l = { .args_ct_str = { "r", "l", "l" } };
+ }
--    static const TCGTargetOpDef s_s_s = { .args_ct_str = { "s", "s", "s" } };
--    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef r_r_rI = { .args_ct_str = { "r", "r", "rI" } };
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    static const TCGTargetOpDef r_r_rIN
+             break;
--        = { .args_ct_str = { "r", "r", "rIN" } };
+         }
--    static const TCGTargetOpDef r_r_rIK
--        = { .args_ct_str = { "r", "r", "rIK" } };
+-        /* Simplify expression for "op r, a, a => mov r, a" cases */
--    static const TCGTargetOpDef r_r_r_r
+-        switch (opc) {
--        = { .args_ct_str = { "r", "r", "r", "r" } };
+-        CASE_OP_32_64_VEC(or):
--    static const TCGTargetOpDef r_r_l_l
+-        CASE_OP_32_64_VEC(and):
--        = { .args_ct_str = { "r", "r", "l", "l" } };
+-            if (args_are_copies(op->args[1], op->args[2])) {
--    static const TCGTargetOpDef s_s_s_s
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
--        = { .args_ct_str = { "s", "s", "s", "s" } };
+-                continue;
--    static const TCGTargetOpDef br
+-            }
--        = { .args_ct_str = { "r", "rIN" } };
+-            break;
--    static const TCGTargetOpDef ext2
+-        default:
--        = { .args_ct_str = { "r", "rZ", "rZ" } };
+-            break;
--    static const TCGTargetOpDef dep
+-        }
 -        = { .args_ct_str = { "r", "0", "rZ" } };
 -    static const TCGTargetOpDef movc
 -        = { .args_ct_str = { "r", "r", "rIN", "rIK", "0" } };
 -    static const TCGTargetOpDef add2
 -        = { .args_ct_str = { "r", "r", "r", "r", "rIN", "rIK" } };
 -    static const TCGTargetOpDef sub2
 -        = { .args_ct_str = { "r", "r", "rI", "rI", "rIN", "rIK" } };
 -    static const TCGTargetOpDef br2
 -        = { .args_ct_str = { "r", "r", "rI", "rI" } };
 -    static const TCGTargetOpDef setc2
 -        = { .args_ct_str = { "r", "r", "r", "rI", "rI" } };
 -
-     switch (op) {
+         /*
-     case INDEX_op_goto_ptr:
+          * Process each opcode.
--        return &r;
+          * Sorted alphabetically by opcode as much as possible.
 +        return C_O0_I1(r);
      case INDEX_op_ld8u_i32:
      case INDEX_op_ld8s_i32:
      case INDEX_op_ld16u_i32:
      case INDEX_op_ld16s_i32:
      case INDEX_op_ld_i32:
 -    case INDEX_op_st8_i32:
 -    case INDEX_op_st16_i32:
 -    case INDEX_op_st_i32:
      case INDEX_op_neg_i32:
      case INDEX_op_not_i32:
      case INDEX_op_bswap16_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_ext16u_i32:
      case INDEX_op_extract_i32:
      case INDEX_op_sextract_i32:
 -        return &r_r;
 +        return C_O1_I1(r, r);
 +
 +    case INDEX_op_st8_i32:
 +    case INDEX_op_st16_i32:
 +    case INDEX_op_st_i32:
 +        return C_O0_I2(r, r);
      case INDEX_op_add_i32:
      case INDEX_op_sub_i32:
      case INDEX_op_setcond_i32:
 -        return &r_r_rIN;
 +        return C_O1_I2(r, r, rIN);
 +
      case INDEX_op_and_i32:
      case INDEX_op_andc_i32:
      case INDEX_op_clz_i32:
      case INDEX_op_ctz_i32:
 -        return &r_r_rIK;
 +        return C_O1_I2(r, r, rIK);
 +
      case INDEX_op_mul_i32:
      case INDEX_op_div_i32:
      case INDEX_op_divu_i32:
 -        return &r_r_r;
 +        return C_O1_I2(r, r, r);
 +
      case INDEX_op_mulu2_i32:
      case INDEX_op_muls2_i32:
 -        return &r_r_r_r;
 +        return C_O2_I2(r, r, r, r);
 +
      case INDEX_op_or_i32:
      case INDEX_op_xor_i32:
 -        return &r_r_rI;
 +        return C_O1_I2(r, r, rI);
 +
      case INDEX_op_shl_i32:
      case INDEX_op_shr_i32:
      case INDEX_op_sar_i32:
      case INDEX_op_rotl_i32:
      case INDEX_op_rotr_i32:
 -        return &r_r_ri;
 +        return C_O1_I2(r, r, ri);
      case INDEX_op_brcond_i32:
 -        return &br;
 +        return C_O0_I2(r, rIN);
      case INDEX_op_deposit_i32:
 -        return &dep;
 +        return C_O1_I2(r, 0, rZ);
      case INDEX_op_extract2_i32:
 -        return &ext2;
 +        return C_O1_I2(r, rZ, rZ);
      case INDEX_op_movcond_i32:
 -        return &movc;
 +        return C_O1_I4(r, r, rIN, rIK, 0);
      case INDEX_op_add2_i32:
 -        return &add2;
 +        return C_O2_I4(r, r, r, r, rIN, rIK);
      case INDEX_op_sub2_i32:
 -        return &sub2;
 +        return C_O2_I4(r, r, rI, rI, rIN, rIK);
      case INDEX_op_brcond2_i32:
 -        return &br2;
 +        return C_O0_I4(r, r, rI, rI);
      case INDEX_op_setcond2_i32:
 -        return &setc2;
 +        return C_O1_I4(r, r, r, rI, rI);
      case INDEX_op_qemu_ld_i32:
 -        return TARGET_LONG_BITS == 32 ? &r_l : &r_l_l;
 +        return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l);
      case INDEX_op_qemu_ld_i64:
 -        return TARGET_LONG_BITS == 32 ? &r_r_l : &r_r_l_l;
 +        return TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, l) : C_O2_I2(r, r, l, l);
      case INDEX_op_qemu_st_i32:
 -        return TARGET_LONG_BITS == 32 ? &s_s : &s_s_s;
 +        return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s);
      case INDEX_op_qemu_st_i64:
 -        return TARGET_LONG_BITS == 32 ? &s_s_s : &s_s_s_s;
 +        return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
      default:
          return NULL;
 --
 .25.1

-[PATCH 14/43] tcg: Add temp_readonly
+[PULL 37/56] tcg/optimize: Split out fold_xi_to_i
-In most, but not all, places that we check for TEMP_FIXED,
+Pull the "op r, a, 0 => movi r, 0" optimization into a function,
-we are really testing that we do not modify the temporary.
+and use it in the outer opcode fold functions.
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h |  5 +++++
+ tcg/optimize.c | 38 ++++++++++++++++++++------------------
- tcg/tcg.c         | 21 ++++++++++-----------
+file changed, 20 insertions(+), 18 deletions(-)
 files changed, 15 insertions(+), 11 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct TCGContext {
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-     target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
+     return false;
- };
+ }
-+static inline bool temp_readonly(TCGTemp *ts)
++/* If the binary operation has second argument @i, fold to @i. */
 +static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
-+    return ts->kind == TEMP_FIXED;
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +    }
 +    return false;
 +}
 +
- extern TCGContext tcg_init_ctx;
+ /* If the binary operation has both arguments equal, fold to @i. */
- extern __thread TCGContext *tcg_ctx;
+ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  extern TCGv_env cpu_env;
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
     mark it free; otherwise mark it dead.  */
  static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
  {
--    if (ts->kind == TEMP_FIXED) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
-+    if (temp_readonly(ts)) {
+ static bool fold_and(OptContext *ctx, TCGOp *op)
-         return;
+ {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_i(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
-     if (ts->val_type == TEMP_VAL_REG) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
- static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
                        TCGRegSet preferred_regs, int free_or_dead)
  {
--    if (ts->kind == TEMP_FIXED) {
+-    return fold_const2(ctx, op);
-+    if (temp_readonly(ts)) {
++    if (fold_const2(ctx, op) ||
-         return;
++        fold_xi_to_i(ctx, op, 0)) {
-     }
++        return true;
-     if (!ts->mem_coherent) {
++    }
-@@ -XXX,XX +XXX,XX @@ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs)
++    return false;
  }
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
-     /* The liveness analysis already ensures that globals are back
+-    return fold_const2(ctx, op);
-        in memory. Keep an tcg_debug_assert for safety. */
++    if (fold_const2(ctx, op) ||
--    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM
++        fold_xi_to_i(ctx, op, 0)) {
--                     || ts->kind == TEMP_FIXED);
++        return true;
-+    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || temp_readonly(ts));
++    }
 +    return false;
  }
- /* save globals to their canonical location and assume they can be
+ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-                                   TCGRegSet preferred_regs)
+             continue;
- {
+         }
-     /* ENV should not be modified.  */
--    tcg_debug_assert(ots->kind != TEMP_FIXED);
+-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
-+    tcg_debug_assert(!temp_readonly(ots));
+-        switch (opc) {
+-        CASE_OP_32_64_VEC(and):
-     /* The movi is not explicitly generated here.  */
+-        CASE_OP_32_64_VEC(mul):
-     if (ots->val_type == TEMP_VAL_REG) {
+-        CASE_OP_32_64(muluh):
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
+-        CASE_OP_32_64(mulsh):
-     ts = arg_temp(op->args[1]);
+-            if (arg_is_const(op->args[2])
+-                && arg_info(op->args[2])->val == 0) {
-     /* ENV should not be modified.  */
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
--    tcg_debug_assert(ots->kind != TEMP_FIXED);
+-                continue;
-+    tcg_debug_assert(!temp_readonly(ots));
+-            }
+-            break;
-     /* Note that otype != itype for no-op truncation.  */
+-        default:
-     otype = ots->type;
+-            break;
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
+-        }
-                  * Store the source register into the destination slot
+-
-                  * and leave the destination temp as TEMP_VAL_MEM.
+         /*
-                  */
+          * Process each opcode.
--                assert(ots->kind != TEMP_FIXED);
+          * Sorted alphabetically by opcode as much as possible.
 +                assert(!temp_readonly(ots));
                  if (!ts->mem_allocated) {
                      temp_allocate_frame(s, ots);
                  }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
      its = arg_temp(op->args[1]);
      /* ENV should not be modified.  */
 -    tcg_debug_assert(ots->kind != TEMP_FIXED);
 +    tcg_debug_assert(!temp_readonly(ots));
      itype = its->type;
      vece = TCGOP_VECE(op);
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
              ts = arg_temp(arg);
              /* ENV should not be modified.  */
 -            tcg_debug_assert(ts->kind != TEMP_FIXED);
 +            tcg_debug_assert(!temp_readonly(ts));
              if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
                  reg = new_args[arg_ct->alias_index];
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
          ts = arg_temp(op->args[i]);
          /* ENV should not be modified.  */
 -        tcg_debug_assert(ts->kind != TEMP_FIXED);
 +        tcg_debug_assert(!temp_readonly(ts));
          if (NEED_SYNC_ARG(i)) {
              temp_sync(s, ts, o_allocated_regs, 0, IS_DEAD_ARG(i));
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
          ts = arg_temp(arg);
          /* ENV should not be modified.  */
 -        tcg_debug_assert(ts->kind != TEMP_FIXED);
 +        tcg_debug_assert(!temp_readonly(ts));
          reg = tcg_target_call_oarg_regs[i];
          tcg_debug_assert(s->reg_to_temp[reg] == NULL);
 --
 .25.1

-[PATCH 18/43] tcg: Introduce TYPE_CONST temporaries
+[PULL 38/56] tcg/optimize: Add type to OptContext
-These will hold a single constant for the duration of the TB.
+Compute the type of the operation early.
-They are hashed, so that each value has one temp across the TB.
+There are at least 4 places that used a def->flags ladder
-Not used yet, this is all infrastructure.
+to determine the type of the operation being optimized.
 There were two places that assumed !TCG_OPF_64BIT means
 TCG_TYPE_I32, and so could potentially compute incorrect
 results for vector operations.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h |  24 +++++-
+ tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
- tcg/optimize.c    |  13 +++-
+file changed, 89 insertions(+), 60 deletions(-)
- tcg/tcg.c         | 195 ++++++++++++++++++++++++++++++++++++----------
 files changed, 188 insertions(+), 44 deletions(-)
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/tcg/tcg.h
 +++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef enum TCGTempKind {
      TEMP_GLOBAL,
      /* Temp is in a fixed register. */
      TEMP_FIXED,
 +    /* Temp is a fixed constant. */
 +    TEMP_CONST,
  } TCGTempKind;
  typedef struct TCGTemp {
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
      QSIMPLEQ_HEAD(, TCGOp) plugin_ops;
  #endif
 +    GHashTable *const_table[TCG_TYPE_COUNT];
      TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
      TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
  static inline bool temp_readonly(TCGTemp *ts)
  {
 -    return ts->kind == TEMP_FIXED;
 +    return ts->kind >= TEMP_FIXED;
  }
  extern TCGContext tcg_init_ctx;
@@ -XXX,XX +XXX,XX @@ TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc);
  void tcg_optimize(TCGContext *s);
 +/* Allocate a new temporary and initialize it with a constant. */
  TCGv_i32 tcg_const_i32(int32_t val);
  TCGv_i64 tcg_const_i64(int64_t val);
  TCGv_i32 tcg_const_local_i32(int32_t val);
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec(TCGType);
  TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec);
  TCGv_vec tcg_const_ones_vec_matching(TCGv_vec);
 +/*
 + * Locate or create a read-only temporary that is a constant.
 + * This kind of temporary need not and should not be freed.
 + */
 +TCGTemp *tcg_constant_internal(TCGType type, int64_t val);
 +
 +static inline TCGv_i32 tcg_constant_i32(int32_t val)
 +{
 +    return temp_tcgv_i32(tcg_constant_internal(TCG_TYPE_I32, val));
 +}
 +
 +static inline TCGv_i64 tcg_constant_i64(int64_t val)
 +{
 +    return temp_tcgv_i64(tcg_constant_internal(TCG_TYPE_I64, val));
 +}
 +
 +TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val);
 +
  #if UINTPTR_MAX == UINT32_MAX
  # define tcg_const_ptr(x)        ((TCGv_ptr)tcg_const_i32((intptr_t)(x)))
  # define tcg_const_local_ptr(x)  ((TCGv_ptr)tcg_const_local_i32((intptr_t)(x)))
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TempOptInfo *infos,
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-         ts->state_ptr = ti;
-         ti->next_copy = ts;
+     /* In flight values from optimization. */
-         ti->prev_copy = ts;
+     uint64_t z_mask;
--        ti->is_const = false;
++    TCGType type;
--        ti->mask = -1;
+ } OptContext;
-+        if (ts->kind == TEMP_CONST) {
-+            ti->is_const = true;
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
-+            ti->val = ti->mask = ts->val;
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-+            if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
+ {
-+                /* High bits of a 32-bit quantity are garbage.  */
+     TCGTemp *dst_ts = arg_temp(dst);
-+                ti->mask |= ~0xffffffffull;
+     TCGTemp *src_ts = arg_temp(src);
-+            }
+-    const TCGOpDef *def;
-+        } else {
+     TempOptInfo *di;
-+            ti->is_const = false;
+     TempOptInfo *si;
-+            ti->mask = -1;
+     uint64_t z_mask;
-+        }
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-         set_bit(idx, temps_used->l);
+     reset_ts(dst_ts);
-     }
+     di = ts_info(dst_ts);
- }
+     si = ts_info(src_ts);
-diff --git a/tcg/tcg.c b/tcg/tcg.c
+-    def = &tcg_op_defs[op->opc];
-index XXXXXXX..XXXXXXX 100644
+-    if (def->flags & TCG_OPF_VECTOR) {
---- a/tcg/tcg.c
+-        new_op = INDEX_op_mov_vec;
-+++ b/tcg/tcg.c
+-    } else if (def->flags & TCG_OPF_64BIT) {
-@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
+-        new_op = INDEX_op_mov_i64;
-     bigendian = 1;
+-    } else {
- #endif
++
++    switch (ctx->type) {
--    if (base_ts->kind != TEMP_FIXED) {
++    case TCG_TYPE_I32:
-+    switch (base_ts->kind) {
+         new_op = INDEX_op_mov_i32;
 +    case TEMP_FIXED:
 +        break;
-+    case TEMP_GLOBAL:
++    case TCG_TYPE_I64:
-         /* We do not support double-indirect registers.  */
++        new_op = INDEX_op_mov_i64;
-         tcg_debug_assert(!base_ts->indirect_reg);
++        break;
-         base_ts->indirect_base = 1;
++    case TCG_TYPE_V64:
-         s->nb_indirects += (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64
++    case TCG_TYPE_V128:
-                             ? 2 : 1);
++    case TCG_TYPE_V256:
-         indirect_reg = 1;
++        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
 +        new_op = INDEX_op_mov_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
      }
+     op->opc = new_op;
-     if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
+-    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
-@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
+     op->args[0] = dst;
-     TCGContext *s = tcg_ctx;
+     op->args[1] = src;
-     int k, idx;
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-+    /* In order to simplify users of tcg_constant_*, silently ignore free. */
+ static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-+    if (ts->kind == TEMP_CONST) {
+                              TCGArg dst, uint64_t val)
-+        return;
+ {
-+    }
+-    const TCGOpDef *def = &tcg_op_defs[op->opc];
 -    TCGType type;
 -    TCGTemp *tv;
 -
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        type = TCG_TYPE_I64;
 -    } else {
 -        type = TCG_TYPE_I32;
 -    }
 -
      /* Convert movi to mov with constant temp. */
 -    tv = tcg_constant_internal(type, val);
 +    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
 +
- #if defined(CONFIG_DEBUG_TCG)
+     init_ts_info(ctx, tv);
-     s->temps_in_use--;
+     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
      if (s->temps_in_use < 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
      set_bit(idx, s->free_temps[k].l);
  }
+@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
-+TCGTemp *tcg_constant_internal(TCGType type, int64_t val)
+     }
-+{
+ }
-+    TCGContext *s = tcg_ctx;
-+    GHashTable *h = s->const_table[type];
+-static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
-+    TCGTemp *ts;
++static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
-+
++                                    uint64_t x, uint64_t y)
-+    if (h == NULL) {
+ {
-+        h = g_hash_table_new(g_int64_hash, g_int64_equal);
+-    const TCGOpDef *def = &tcg_op_defs[op];
-+        s->const_table[type] = h;
+     uint64_t res = do_constant_folding_2(op, x, y);
-+    }
+-    if (!(def->flags & TCG_OPF_64BIT)) {
-+
++    if (type == TCG_TYPE_I32) {
-+    ts = g_hash_table_lookup(h, &val);
+         res = (int32_t)res;
-+    if (ts == NULL) {
+     }
-+        ts = tcg_temp_alloc(s);
+     return res;
-+
+@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
-+        if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
+  * Return -1 if the condition can't be simplified,
-+            TCGTemp *ts2 = tcg_temp_alloc(s);
+  * and the result of the condition (0 or 1) if it can.
-+
+  */
-+            ts->base_type = TCG_TYPE_I64;
+-static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
-+            ts->type = TCG_TYPE_I32;
++static int do_constant_folding_cond(TCGType type, TCGArg x,
-+            ts->kind = TEMP_CONST;
+                                     TCGArg y, TCGCond c)
-+            ts->temp_allocated = 1;
+ {
-+            /*
+     uint64_t xv = arg_info(x)->val;
-+             * Retain the full value of the 64-bit constant in the low
+     uint64_t yv = arg_info(y)->val;
-+             * part, so that the hash table works.  Actual uses will
-+             * truncate the value to the low part.
+     if (arg_is_const(x) && arg_is_const(y)) {
-+             */
+-        const TCGOpDef *def = &tcg_op_defs[op];
-+            ts->val = val;
+-        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
-+
+-        if (def->flags & TCG_OPF_64BIT) {
-+            tcg_debug_assert(ts2 == ts + 1);
+-            return do_constant_folding_cond_64(xv, yv, c);
-+            ts2->base_type = TCG_TYPE_I64;
+-        } else {
-+            ts2->type = TCG_TYPE_I32;
++        switch (type) {
-+            ts2->kind = TEMP_CONST;
++        case TCG_TYPE_I32:
-+            ts2->temp_allocated = 1;
+             return do_constant_folding_cond_32(xv, yv, c);
-+            ts2->val = val >> 32;
++        case TCG_TYPE_I64:
-+        } else {
++            return do_constant_folding_cond_64(xv, yv, c);
-+            ts->base_type = type;
++        default:
-+            ts->type = type;
++            /* Only scalar comparisons are optimizable */
-+            ts->kind = TEMP_CONST;
++            return -1;
-+            ts->temp_allocated = 1;
+         }
-+            ts->val = val;
+     } else if (args_are_copies(x, y)) {
-+        }
+         return do_constant_folding_cond_eq(c);
-+        g_hash_table_insert(h, &ts->val, ts);
+@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
-+    }
+         uint64_t t;
-+
-+    return ts;
+         t = arg_info(op->args[1])->val;
-+}
+-        t = do_constant_folding(op->opc, t, 0);
-+
++        t = do_constant_folding(op->opc, ctx->type, t, 0);
-+TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val)
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
-+{
+     }
-+    val = dup_const(vece, val);
+     return false;
-+    return temp_tcgv_vec(tcg_constant_internal(type, val));
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-+}
+         uint64_t t1 = arg_info(op->args[1])->val;
-+
+         uint64_t t2 = arg_info(op->args[2])->val;
- TCGv_i32 tcg_const_i32(int32_t val)
- {
+-        t1 = do_constant_folding(op->opc, t1, t2);
-     TCGv_i32 t0;
++        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_start(TCGContext *s)
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
-         TCGTempVal val = TEMP_VAL_MEM;
+     }
+     return false;
-         switch (ts->kind) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
-+        case TEMP_CONST:
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
-+            val = TEMP_VAL_CONST;
+ {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                       op->args[2], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
              goto do_brcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, op->args[2]);
 +        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          uint64_t t = arg_info(op->args[1])->val;
          if (t != 0) {
 -            t = do_constant_folding(op->opc, t, 0);
 +            t = do_constant_folding(op->opc, ctx->type, t, 0);
              return tcg_opt_gen_movi(ctx, op, op->args[0], t);
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
 -    TCGOpcode opc = op->opc;
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
 +        TCGOpcode opc;
 -        opc = (opc == INDEX_op_movcond_i32
 -               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
 +        switch (ctx->type) {
 +        case TCG_TYPE_I32:
 +            opc = INDEX_op_setcond_i32;
 +            break;
-         case TEMP_FIXED:
-             val = TEMP_VAL_REG;
-             break;
-@@ -XXX,XX +XXX,XX @@ static char *tcg_get_arg_str_ptr(TCGContext *s, char *buf, int buf_size,
-     case TEMP_NORMAL:
-         snprintf(buf, buf_size, "tmp%d", idx - s->nb_globals);
-         break;
-+    case TEMP_CONST:
-+        switch (ts->type) {
-+        case TCG_TYPE_I32:
-+            snprintf(buf, buf_size, "$0x%x", (int32_t)ts->val);
-+            break;
-+#if TCG_TARGET_REG_BITS > 32
 +        case TCG_TYPE_I64:
-+            snprintf(buf, buf_size, "$0x%" PRIx64, ts->val);
++            opc = INDEX_op_setcond_i64;
 +            break;
 +#endif
 +        case TCG_TYPE_V64:
 +        case TCG_TYPE_V128:
 +        case TCG_TYPE_V256:
 +            snprintf(buf, buf_size, "v%d$0x%" PRIx64,
 +                     64 << (ts->type - TCG_TYPE_V64), ts->val);
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
-+        break;
-     }
+         if (tv == 1 && fv == 0) {
-     return buf;
+             op->opc = opc;
- }
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void la_bb_end(TCGContext *s, int ng, int nt)
+ static bool fold_setcond(OptContext *ctx, TCGOp *op)
-             state = TS_DEAD | TS_MEM;
+ {
-             break;
+     TCGCond cond = op->args[3];
-         case TEMP_NORMAL:
+-    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
-+        case TEMP_CONST:
++    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
-             state = TS_DEAD;
-             break;
+     if (i >= 0) {
-         default:
+         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
-@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
-    mark it free; otherwise mark it dead.  */
+          * Simplify EQ/NE comparisons where one of the pairs
- static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
+          * can be simplified.
- {
+          */
--    if (temp_readonly(ts)) {
+-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
-+    TCGTempVal new_type;
++        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
              goto do_setcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                       op->args[4], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
          copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 +        /* Pre-compute the type of the operation. */
 +        if (def->flags & TCG_OPF_VECTOR) {
 +            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
 +        } else if (def->flags & TCG_OPF_64BIT) {
 +            ctx.type = TCG_TYPE_I64;
 +        } else {
 +            ctx.type = TCG_TYPE_I32;
 +        }
 +
-+    switch (ts->kind) {
+         /* For commutative operations make constant second argument */
-+    case TEMP_FIXED:
+         switch (opc) {
-         return;
+         CASE_OP_32_64_VEC(add):
-+    case TEMP_GLOBAL:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+    case TEMP_LOCAL:
+                     /* Proceed with possible constant folding. */
-+        new_type = TEMP_VAL_MEM;
+                     break;
 +        break;
 +    case TEMP_NORMAL:
 +        new_type = free_or_dead < 0 ? TEMP_VAL_MEM : TEMP_VAL_DEAD;
 +        break;
 +    case TEMP_CONST:
 +        new_type = TEMP_VAL_CONST;
 +        break;
 +    default:
 +        g_assert_not_reached();
      }
      if (ts->val_type == TEMP_VAL_REG) {
          s->reg_to_temp[ts->reg] = NULL;
      }
 -    ts->val_type = (free_or_dead < 0
 -                    || ts->kind != TEMP_NORMAL
 -                    ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
 +    ts->val_type = new_type;
  }
  /* Mark a temporary as dead.  */
@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
  static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
                        TCGRegSet preferred_regs, int free_or_dead)
  {
 -    if (temp_readonly(ts)) {
 -        return;
 -    }
 -    if (!ts->mem_coherent) {
 +    if (!temp_readonly(ts) && !ts->mem_coherent) {
          if (!ts->mem_allocated) {
              temp_allocate_frame(s, ts);
          }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
      for (i = s->nb_globals; i < s->nb_temps; i++) {
          TCGTemp *ts = &s->temps[i];
 -        if (ts->kind == TEMP_LOCAL) {
 +
 +        switch (ts->kind) {
 +        case TEMP_LOCAL:
              temp_save(s, ts, allocated_regs);
 -        } else {
 +            break;
 +        case TEMP_NORMAL:
              /* The liveness analysis already ensures that temps are dead.
                 Keep an tcg_debug_assert for safety. */
              tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD);
 +            break;
 +        case TEMP_CONST:
 +            /* Similarly, we should have freed any allocated register. */
 +            tcg_debug_assert(ts->val_type == TEMP_VAL_CONST);
 +            break;
 +        default:
 +            g_assert_not_reached();
          }
      }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
          i_preferred_regs = o_preferred_regs = 0;
          if (arg_ct->ialias) {
              o_preferred_regs = op->output_pref[arg_ct->alias_index];
 -            if (ts->kind == TEMP_FIXED) {
 -                /* if fixed register, we must allocate a new register
 -                   if the alias is not the same register */
 -                if (arg != op->args[arg_ct->alias_index]) {
 -                    goto allocate_in_reg;
 -                }
 -            } else {
 -                /* if the input is aliased to an output and if it is
 -                   not dead after the instruction, we must allocate
 -                   a new register and move it */
 -                if (!IS_DEAD_ARG(i)) {
 -                    goto allocate_in_reg;
 -                }
 -                /* check if the current register has already been allocated
 -                   for another input aliased to an output */
 -                if (ts->val_type == TEMP_VAL_REG) {
 -                    int k2, i2;
 -                    reg = ts->reg;
 -                    for (k2 = 0 ; k2 < k ; k2++) {
 -                        i2 = def->args_ct[nb_oargs + k2].sort_index;
 -                        if (def->args_ct[i2].ialias && reg == new_args[i2]) {
 -                            goto allocate_in_reg;
 -                        }
 +            /*
 +             * If the input is readonly, then it cannot also be an
 +             * output and aliased to itself.  If the input is not
 +             * dead after the instruction, we must allocate a new
 +             * register and move it.
 +             */
 +            if (temp_readonly(ts) || !IS_DEAD_ARG(i)) {
 +                goto allocate_in_reg;
 +            }
 +
 +            /*
 +             * Check if the current register has already been allocated
 +             * for another input aliased to an output.
 +             */
 +            if (ts->val_type == TEMP_VAL_REG) {
 +                reg = ts->reg;
 +                for (int k2 = 0; k2 < k; k2++) {
 +                    int i2 = def->args_ct[nb_oargs + k2].sort_index;
 +                    if (def->args_ct[i2].ialias && reg == new_args[i2]) {
 +                        goto allocate_in_reg;
                      }
                  }
--                i_preferred_regs = o_preferred_regs;
+-                if (opc == INDEX_op_sub_i32) {
-             }
++                switch (ctx.type) {
-+            i_preferred_regs = o_preferred_regs;
++                case TCG_TYPE_I32:
-         }
+                     neg_op = INDEX_op_neg_i32;
+                     have_neg = TCG_TARGET_HAS_neg_i32;
-         temp_load(s, ts, arg_ct->regs, i_allocated_regs, i_preferred_regs);
+-                } else if (opc == INDEX_op_sub_i64) {
-         reg = ts->reg;
++                    break;
++                case TCG_TYPE_I64:
--        if (tcg_regset_test_reg(arg_ct->regs, reg)) {
+                     neg_op = INDEX_op_neg_i64;
--            /* nothing to do : the constraint is satisfied */
+                     have_neg = TCG_TARGET_HAS_neg_i64;
--        } else {
+-                } else if (TCG_TARGET_HAS_neg_vec) {
--        allocate_in_reg:
+-                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
--            /* allocate a new register matching the constraint
+-                    unsigned vece = TCGOP_VECE(op);
--               and move the temporary register into it */
+-                    neg_op = INDEX_op_neg_vec;
-+        if (!tcg_regset_test_reg(arg_ct->regs, reg)) {
+-                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
-+ allocate_in_reg:
+-                } else {
-+            /*
+                     break;
-+             * Allocate a new register matching the constraint
++                case TCG_TYPE_V64:
-+             * and move the temporary register into it.
++                case TCG_TYPE_V128:
-+             */
++                case TCG_TYPE_V256:
-             temp_load(s, ts, tcg_target_available_regs[ts->type],
++                    neg_op = INDEX_op_neg_vec;
-                       i_allocated_regs, 0);
++                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
-             reg = tcg_reg_alloc(s, arg_ct->regs, i_allocated_regs,
++                                                   TCGOP_VECE(op)) > 0;
-@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
++                    break;
-     }
++                default:
- #endif
++                    g_assert_not_reached();
+                 }
-+    for (i = 0; i < TCG_TYPE_COUNT; ++i) {
+                 if (!have_neg) {
-+        if (s->const_table[i]) {
+                     break;
-+            g_hash_table_destroy(s->const_table[i]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+            s->const_table[i] = NULL;
+                 TCGOpcode not_op;
-+        }
+                 bool have_not;
-+    }
-+
+-                if (def->flags & TCG_OPF_VECTOR) {
-     tcg_reg_alloc_start(s);
+-                    not_op = INDEX_op_not_vec;
+-                    have_not = TCG_TARGET_HAS_not_vec;
-     s->code_buf = tb->tc.ptr;
+-                } else if (def->flags & TCG_OPF_64BIT) {
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                } else {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      not_op = INDEX_op_not_i32;
                      have_not = TCG_TARGET_HAS_not_i32;
 +                    break;
 +                case TCG_TYPE_I64:
 +                    not_op = INDEX_op_not_i64;
 +                    have_not = TCG_TARGET_HAS_not_i64;
 +                    break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    not_op = INDEX_op_not_vec;
 +                    have_not = TCG_TARGET_HAS_not_vec;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_not) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
          partmask = z_mask;
 -        if (!(def->flags & TCG_OPF_64BIT)) {
 +        if (ctx.type == TCG_TYPE_I32) {
              z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
 --
 2.25.1

-[PATCH 07/43] tcg/i386: Fix dupi for avx2 32-bit hosts
+[PULL 39/56] tcg/optimize: Split out fold_to_not
-The previous change wrongly stated that 32-bit avx2 should have
+Split out the conditional conversion from a more complex logical
-used VPBROADCASTW.  But that's a 16-bit broadcast and we want a
+operation to a simple NOT.  Create a couple more helpers to make
-32-bit broadcast.
+this easy for the outer-most logical operations.
-Fixes: 7b60ef3264e
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Cc: qemu-stable@nongnu.org
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/i386/tcg-target.c.inc | 2 +-
+ tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
- 1 file changed, 1 insertion(+), 1 deletion(-)
+ 1 file changed, 86 insertions(+), 72 deletions(-)
-diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/i386/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/i386/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
+     return false;
-     } else {
+ }
-         if (have_avx2) {
--            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTW + vex_l, ret);
++/*
-+            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
++ * Convert @op to NOT, if NOT is supported by the host.
-         } else {
++ * Return true if the conversion is successful, which will still
-             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
++ * indicate that the processing is complete.
 + */
 +static bool fold_not(OptContext *ctx, TCGOp *op);
 +static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
 +{
 +    TCGOpcode not_op;
 +    bool have_not;
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        not_op = INDEX_op_not_i32;
 +        have_not = TCG_TARGET_HAS_not_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        not_op = INDEX_op_not_i64;
 +        have_not = TCG_TARGET_HAS_not_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        not_op = INDEX_op_not_vec;
 +        have_not = TCG_TARGET_HAS_not_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    if (have_not) {
 +        op->opc = not_op;
 +        op->args[1] = op->args[idx];
 +        return fold_not(ctx, op);
 +    }
 +    return false;
 +}
 +
 +/* If the binary operation has first argument @i, fold to NOT. */
 +static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
 +        return fold_to_not(ctx, op, 2);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has second argument @i, fold to @i. */
  static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
      return false;
  }
 +/* If the binary operation has second argument @i, fold to NOT. */
 +static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return fold_to_not(ctx, op, 1);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has both arguments equal, fold to @i. */
  static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, -1)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_not(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    /* Because of fold_to_not, we want to always return true, via finish. */
 +    finish_folding(ctx, op);
 +    return true;
  }
  static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_ix_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              }
              break;
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(nand):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == -1) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64(nor):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(andc):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == -1) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(orc):
 -        CASE_OP_32_64(eqv):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        try_not:
 -            {
 -                TCGOpcode not_op;
 -                bool have_not;
 -
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    not_op = INDEX_op_not_i32;
 -                    have_not = TCG_TARGET_HAS_not_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_not) {
 -                    break;
 -                }
 -                op->opc = not_op;
 -                reset_temp(op->args[0]);
 -                op->args[1] = op->args[i];
 -                continue;
 -            }
          default:
              break;
          }
 --
 2.25.1

-[PATCH 11/43] tcg: Use tcg_out_dupi_vec from temp_load
+[PULL 40/56] tcg/optimize: Split out fold_sub_to_neg
-Having dupi pass though movi is confusing and arguably wrong.
+Even though there is only one user, place this more complex
 conversion into its own helper.
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg.c                    |  6 +++-
+ tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
- tcg/aarch64/tcg-target.c.inc |  7 ----
+ 1 file changed, 47 insertions(+), 42 deletions(-)
  tcg/i386/tcg-target.c.inc    | 63 ++++++++++++++++++++++++------------
  tcg/ppc/tcg-target.c.inc     |  6 ----
  4 files changed, 47 insertions(+), 35 deletions(-)
-diff --git a/tcg/tcg.c b/tcg/tcg.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
+--- a/tcg/optimize.c
-+++ b/tcg/tcg.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
+@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
-     case TEMP_VAL_CONST:
-         reg = tcg_reg_alloc(s, desired_regs, allocated_regs,
+ static bool fold_neg(OptContext *ctx, TCGOp *op)
-                             preferred_regs, ts->indirect_base);
+ {
--        tcg_out_movi(s, ts->type, reg, ts->val);
+-    return fold_const1(ctx, op);
-+        if (ts->type <= TCG_TYPE_I64) {
++    if (fold_const1(ctx, op)) {
-+            tcg_out_movi(s, ts->type, reg, ts->val);
++        return true;
-+        } else {
++    }
-+            tcg_out_dupi_vec(s, ts->type, reg, ts->val);
++    /*
-+        }
++     * Because of fold_sub_to_neg, we want to always return true,
-         ts->mem_coherent = 0;
++     * via finish_folding.
-         break;
++     */
-     case TEMP_VAL_MEM:
++    finish_folding(ctx, op);
-diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
++    return true;
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/aarch64/tcg-target.c.inc
 +++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
      case TCG_TYPE_I64:
          tcg_debug_assert(rd < 32);
          break;
 -
 -    case TCG_TYPE_V64:
 -    case TCG_TYPE_V128:
 -        tcg_debug_assert(rd >= 32);
 -        tcg_out_dupi_vec(s, type, rd, value);
 -        return;
 -
      default:
          g_assert_not_reached();
      }
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
      }
  }
--static void tcg_out_movi(TCGContext *s, TCGType type,
+ static bool fold_nor(OptContext *ctx, TCGOp *op)
--                         TCGReg ret, tcg_target_long arg)
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-+static void tcg_out_movi_vec(TCGContext *s, TCGType type,
+     return fold_const2(ctx, op);
-+                             TCGReg ret, tcg_target_long arg)
+ }
 +static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
 +{
-+    if (arg == 0) {
++    TCGOpcode neg_op;
-+        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
++    bool have_neg;
-+        return;
++
-+    }
++    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
-+    if (arg == -1) {
++        return false;
 +        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 +        return;
 +    }
 +
-+    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
++    switch (ctx->type) {
 +    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
 +    if (TCG_TARGET_REG_BITS == 64) {
 +        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 +    } else {
 +        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 +    }
 +}
 +
 +static void tcg_out_movi_int(TCGContext *s, TCGType type,
 +                             TCGReg ret, tcg_target_long arg)
  {
      tcg_target_long diff;
 -    switch (type) {
 -    case TCG_TYPE_I32:
 -#if TCG_TARGET_REG_BITS == 64
 -    case TCG_TYPE_I64:
 -#endif
 -        if (ret < 16) {
 -            break;
 -        }
 -        /* fallthru */
 -    case TCG_TYPE_V64:
 -    case TCG_TYPE_V128:
 -    case TCG_TYPE_V256:
 -        tcg_debug_assert(ret >= 16);
 -        tcg_out_dupi_vec(s, type, ret, arg);
 -        return;
 -    default:
 -        g_assert_not_reached();
 -    }
 -
      if (arg == 0) {
          tgen_arithr(s, ARITH_XOR, ret, ret);
          return;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type,
      tcg_out64(s, arg);
  }
 +static void tcg_out_movi(TCGContext *s, TCGType type,
 +                         TCGReg ret, tcg_target_long arg)
 +{
 +    switch (type) {
 +    case TCG_TYPE_I32:
-+#if TCG_TARGET_REG_BITS == 64
++        neg_op = INDEX_op_neg_i32;
 +        have_neg = TCG_TARGET_HAS_neg_i32;
 +        break;
 +    case TCG_TYPE_I64:
-+#endif
++        neg_op = INDEX_op_neg_i64;
-+        if (ret < 16) {
++        have_neg = TCG_TARGET_HAS_neg_i64;
-+            tcg_out_movi_int(s, type, ret, arg);
++        break;
-+        } else {
++    case TCG_TYPE_V64:
-+            tcg_out_movi_vec(s, type, ret, arg);
++    case TCG_TYPE_V128:
-+        }
++    case TCG_TYPE_V256:
 +        neg_op = INDEX_op_neg_vec;
 +        have_neg = (TCG_TARGET_HAS_neg_vec &&
 +                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
++    if (have_neg) {
++        op->opc = neg_op;
++        op->args[1] = op->args[2];
++        return fold_neg(ctx, op);
++    }
++    return false;
 +}
 +
- static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
-     if (val == (int8_t)val) {
+     if (fold_const2(ctx, op) ||
-diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
+-        fold_xx_to_i(ctx, op, 0)) {
-index XXXXXXX..XXXXXXX 100644
++        fold_xx_to_i(ctx, op, 0) ||
---- a/tcg/ppc/tcg-target.c.inc
++        fold_sub_to_neg(ctx, op)) {
-+++ b/tcg/ppc/tcg-target.c.inc
+         return true;
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret,
+     }
-         tcg_out_movi_int(s, type, ret, arg, false);
+     return false;
-         break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 continue;
--    case TCG_TYPE_V64:
+             }
--    case TCG_TYPE_V128:
+             break;
--        tcg_debug_assert(ret >= TCG_REG_V0);
+-        CASE_OP_32_64_VEC(sub):
--        tcg_out_dupi_vec(s, type, ret, arg);
+-            {
--        break;
+-                TCGOpcode neg_op;
 -                bool have_neg;
 -
-     default:
+-                if (arg_is_const(op->args[2])) {
-         g_assert_not_reached();
+-                    /* Proceed with possible constant folding. */
-     }
+-                    break;
 -                }
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    neg_op = INDEX_op_neg_i32;
 -                    have_neg = TCG_TARGET_HAS_neg_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    neg_op = INDEX_op_neg_i64;
 -                    have_neg = TCG_TARGET_HAS_neg_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 -                                                   TCGOP_VECE(op)) > 0;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_neg) {
 -                    break;
 -                }
 -                if (arg_is_const(op->args[1])
 -                    && arg_info(op->args[1])->val == 0) {
 -                    op->opc = neg_op;
 -                    reset_temp(op->args[0]);
 -                    op->args[1] = op->args[2];
 -                    continue;
 -                }
 -            }
 -            break;
          default:
              break;
          }
 --
 .25.1

-[PATCH 36/43] tcg/aarch64: Convert to tcg-constr.c.inc
+[PULL 41/56] tcg/optimize: Split out fold_xi_to_x
+Pull the "op r, a, i => mov r, a" optimization into a function,
+and use it in the outer-most logical operations.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/aarch64/tcg-target-constr.h | 31 ++++++++++++
+ tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
- tcg/aarch64/tcg-target.c.inc    | 85 +++++++++++----------------------
+file changed, 26 insertions(+), 35 deletions(-)
 files changed, 60 insertions(+), 56 deletions(-)
  create mode 100644 tcg/aarch64/tcg-target-constr.h
-diff --git a/tcg/aarch64/tcg-target-constr.h b/tcg/aarch64/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
-new file mode 100644
+index XXXXXXX..XXXXXXX 100644
-index XXXXXXX..XXXXXXX
+--- a/tcg/optimize.c
---- /dev/null
++++ b/tcg/optimize.c
-+++ b/tcg/aarch64/tcg-target-constr.h
+@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-@@ -XXX,XX +XXX,XX @@
+     return false;
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
+ }
-+/*
-+ * AArch64 target-specific operand constraints.
++/* If the binary operation has second argument @i, fold to identity. */
-+ * Copyright (c) 2020 Linaro
++static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
-+ */
++{
 +    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +    }
 +    return false;
 +}
 +
-+C_O0_I1(r)
+ /* If the binary operation has second argument @i, fold to NOT. */
-+C_O0_I2(lZ, l)
+ static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
-+C_O0_I2(r, rA)
+ {
-+C_O0_I2(rZ, r)
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
-+C_O0_I2(w, r)
-+C_O1_I1(r, l)
+ static bool fold_add(OptContext *ctx, TCGOp *op)
-+C_O1_I1(r, r)
+ {
-+C_O1_I1(w, r)
+-    return fold_const2(ctx, op);
-+C_O1_I1(w, w)
++    if (fold_const2(ctx, op) ||
-+C_O1_I1(w, wr)
++        fold_xi_to_x(ctx, op, 0)) {
-+C_O1_I2(r, 0, rZ)
++        return true;
-+C_O1_I2(r, r, r)
++    }
-+C_O1_I2(r, r, rA)
++    return false;
 +C_O1_I2(r, r, rAL)
 +C_O1_I2(r, r, ri)
 +C_O1_I2(r, r, rL)
 +C_O1_I2(r, rZ, rZ)
 +C_O1_I2(w, 0, w)
 +C_O1_I2(w, w, w)
 +C_O1_I2(w, w, wN)
 +C_O1_I2(w, w, wO)
 +C_O1_I2(w, w, wZ)
 +C_O1_I3(w, w, w, w)
 +C_O1_I4(r, r, rA, rZ, rZ)
 +C_O2_I4(r, r, rZ, rZ, rA, rMZ)
 diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/aarch64/tcg-target.c.inc
 +++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
      va_end(va);
  }
-+/* Define all constraint sets. */
+ static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
-+#include "../tcg-constr.c.inc"
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 +
  static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
  {
--    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
+     if (fold_const2(ctx, op) ||
--    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
+         fold_xi_to_i(ctx, op, 0) ||
--    static const TCGTargetOpDef w_w = { .args_ct_str = { "w", "w" } };
++        fold_xi_to_x(ctx, op, -1) ||
--    static const TCGTargetOpDef w_r = { .args_ct_str = { "w", "r" } };
+         fold_xx_to_x(ctx, op)) {
--    static const TCGTargetOpDef w_wr = { .args_ct_str = { "w", "wr" } };
+         return true;
--    static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
+     }
--    static const TCGTargetOpDef r_rA = { .args_ct_str = { "r", "rA" } };
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
+ {
--    static const TCGTargetOpDef lZ_l = { .args_ct_str = { "lZ", "l" } };
+     if (fold_const2(ctx, op) ||
--    static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
+         fold_xx_to_i(ctx, op, 0) ||
--    static const TCGTargetOpDef w_w_w = { .args_ct_str = { "w", "w", "w" } };
++        fold_xi_to_x(ctx, op, 0) ||
--    static const TCGTargetOpDef w_0_w = { .args_ct_str = { "w", "0", "w" } };
+         fold_ix_to_not(ctx, op, -1)) {
--    static const TCGTargetOpDef w_w_wO = { .args_ct_str = { "w", "w", "wO" } };
+         return true;
--    static const TCGTargetOpDef w_w_wN = { .args_ct_str = { "w", "w", "wN" } };
+     }
--    static const TCGTargetOpDef w_w_wZ = { .args_ct_str = { "w", "w", "wZ" } };
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef r_r_rA = { .args_ct_str = { "r", "r", "rA" } };
+ {
--    static const TCGTargetOpDef r_r_rL = { .args_ct_str = { "r", "r", "rL" } };
+     if (fold_const2(ctx, op) ||
--    static const TCGTargetOpDef r_r_rAL
++        fold_xi_to_x(ctx, op, -1) ||
--        = { .args_ct_str = { "r", "r", "rAL" } };
+         fold_xi_to_not(ctx, op, 0)) {
--    static const TCGTargetOpDef dep
+         return true;
--        = { .args_ct_str = { "r", "0", "rZ" } };
+     }
--    static const TCGTargetOpDef ext2
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
--        = { .args_ct_str = { "r", "rZ", "rZ" } };
+ static bool fold_or(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef movc
+ {
--        = { .args_ct_str = { "r", "r", "rA", "rZ", "rZ" } };
+     if (fold_const2(ctx, op) ||
--    static const TCGTargetOpDef add2
++        fold_xi_to_x(ctx, op, 0) ||
--        = { .args_ct_str = { "r", "r", "rZ", "rZ", "rA", "rMZ" } };
+         fold_xx_to_x(ctx, op)) {
--    static const TCGTargetOpDef w_w_w_w
+         return true;
--        = { .args_ct_str = { "w", "w", "w", "w" } };
+     }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, -1) ||
          fold_ix_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_sub_to_neg(ctx, op)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify expression for "op r, a, const => mov r, a" cases */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(add):
 -        CASE_OP_32_64_VEC(sub):
 -        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64_VEC(andc):
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -                continue;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(orc):
 -        CASE_OP_32_64(eqv):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == -1) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
-     switch (op) {
+         /* Simplify using known-zero bits. Currently only ops with a single
-     case INDEX_op_goto_ptr:
+            output argument are supported. */
--        return &r;
+         z_mask = -1;
 +        return C_O0_I1(r);
      case INDEX_op_ld8u_i32:
      case INDEX_op_ld8s_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_extract_i64:
      case INDEX_op_sextract_i32:
      case INDEX_op_sextract_i64:
 -        return &r_r;
 +        return C_O1_I1(r, r);
      case INDEX_op_st8_i32:
      case INDEX_op_st16_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_st16_i64:
      case INDEX_op_st32_i64:
      case INDEX_op_st_i64:
 -        return &rZ_r;
 +        return C_O0_I2(rZ, r);
      case INDEX_op_add_i32:
      case INDEX_op_add_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_sub_i64:
      case INDEX_op_setcond_i32:
      case INDEX_op_setcond_i64:
 -        return &r_r_rA;
 +        return C_O1_I2(r, r, rA);
      case INDEX_op_mul_i32:
      case INDEX_op_mul_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_remu_i64:
      case INDEX_op_muluh_i64:
      case INDEX_op_mulsh_i64:
 -        return &r_r_r;
 +        return C_O1_I2(r, r, r);
      case INDEX_op_and_i32:
      case INDEX_op_and_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_orc_i64:
      case INDEX_op_eqv_i32:
      case INDEX_op_eqv_i64:
 -        return &r_r_rL;
 +        return C_O1_I2(r, r, rL);
      case INDEX_op_shl_i32:
      case INDEX_op_shr_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_sar_i64:
      case INDEX_op_rotl_i64:
      case INDEX_op_rotr_i64:
 -        return &r_r_ri;
 +        return C_O1_I2(r, r, ri);
      case INDEX_op_clz_i32:
      case INDEX_op_ctz_i32:
      case INDEX_op_clz_i64:
      case INDEX_op_ctz_i64:
 -        return &r_r_rAL;
 +        return C_O1_I2(r, r, rAL);
      case INDEX_op_brcond_i32:
      case INDEX_op_brcond_i64:
 -        return &r_rA;
 +        return C_O0_I2(r, rA);
      case INDEX_op_movcond_i32:
      case INDEX_op_movcond_i64:
 -        return &movc;
 +        return C_O1_I4(r, r, rA, rZ, rZ);
      case INDEX_op_qemu_ld_i32:
      case INDEX_op_qemu_ld_i64:
 -        return &r_l;
 +        return C_O1_I1(r, l);
      case INDEX_op_qemu_st_i32:
      case INDEX_op_qemu_st_i64:
 -        return &lZ_l;
 +        return C_O0_I2(lZ, l);
      case INDEX_op_deposit_i32:
      case INDEX_op_deposit_i64:
 -        return &dep;
 +        return C_O1_I2(r, 0, rZ);
      case INDEX_op_extract2_i32:
      case INDEX_op_extract2_i64:
 -        return &ext2;
 +        return C_O1_I2(r, rZ, rZ);
      case INDEX_op_add2_i32:
      case INDEX_op_add2_i64:
      case INDEX_op_sub2_i32:
      case INDEX_op_sub2_i64:
 -        return &add2;
 +        return C_O2_I4(r, r, rZ, rZ, rA, rMZ);
      case INDEX_op_add_vec:
      case INDEX_op_sub_vec:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_shrv_vec:
      case INDEX_op_sarv_vec:
      case INDEX_op_aa64_sshl_vec:
 -        return &w_w_w;
 +        return C_O1_I2(w, w, w);
      case INDEX_op_not_vec:
      case INDEX_op_neg_vec:
      case INDEX_op_abs_vec:
      case INDEX_op_shli_vec:
      case INDEX_op_shri_vec:
      case INDEX_op_sari_vec:
 -        return &w_w;
 +        return C_O1_I1(w, w);
      case INDEX_op_ld_vec:
 -    case INDEX_op_st_vec:
      case INDEX_op_dupm_vec:
 -        return &w_r;
 +        return C_O1_I1(w, r);
 +    case INDEX_op_st_vec:
 +        return C_O0_I2(w, r);
      case INDEX_op_dup_vec:
 -        return &w_wr;
 +        return C_O1_I1(w, wr);
      case INDEX_op_or_vec:
      case INDEX_op_andc_vec:
 -        return &w_w_wO;
 +        return C_O1_I2(w, w, wO);
      case INDEX_op_and_vec:
      case INDEX_op_orc_vec:
 -        return &w_w_wN;
 +        return C_O1_I2(w, w, wN);
      case INDEX_op_cmp_vec:
 -        return &w_w_wZ;
 +        return C_O1_I2(w, w, wZ);
      case INDEX_op_bitsel_vec:
 -        return &w_w_w_w;
 +        return C_O1_I3(w, w, w, w);
      case INDEX_op_aa64_sli_vec:
 -        return &w_0_w;
 +        return C_O1_I2(w, 0, w);
      default:
          return NULL;
 --
 .25.1

-[PATCH 05/43] tcg: Move some TCG_CT_* bits to TCGArgConstraint bitfields
+[PULL 42/56] tcg/optimize: Split out fold_ix_to_i
-These are easier to set and test when they have their own fields.
+Pull the "op r, 0, b => movi r, 0" optimization into a function,
-Reduce the size of alias_index and sort_index to 4 bits, which is
+and use it in fold_shift.
 sufficient for TCG_MAX_OP_ARGS.  This leaves only the bits indicating
 constants within the ct field.
-Move all initialization to allocation time, rather than init
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-individual fields in process_op_defs.
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h | 14 +++++++-------
+ tcg/optimize.c | 28 ++++++++++------------------
- tcg/tcg.c         | 28 ++++++++++++----------------
+file changed, 10 insertions(+), 18 deletions(-)
 files changed, 19 insertions(+), 23 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ int64_t tcg_cpu_exec_time(void);
+@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
- void tcg_dump_info(void);
+     return false;
- void tcg_dump_op_count(void);
+ }
--#define TCG_CT_ALIAS  0x80
++/* If the binary operation has first argument @i, fold to @i. */
--#define TCG_CT_IALIAS 0x40
++static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
--#define TCG_CT_NEWREG 0x20 /* output requires a new register */
++{
--#define TCG_CT_CONST  0x02 /* any constant of register size */
++    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
-+#define TCG_CT_CONST  1 /* any constant of register size */
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
- typedef struct TCGArgConstraint {
++    return false;
--    uint16_t ct;
++}
--    uint8_t alias_index;
++
--    uint8_t sort_index;
+ /* If the binary operation has first argument @i, fold to NOT. */
-+    unsigned ct : 16;
+ static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
-+    unsigned alias_index : 4;
+ {
-+    unsigned sort_index : 4;
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
-+    bool oalias : 1;
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
-+    bool ialias : 1;
+ {
-+    bool newreg : 1;
+     if (fold_const2(ctx, op) ||
-     TCGRegSet regs;
++        fold_ix_to_i(ctx, op, 0) ||
- } TCGArgConstraint;
+         fold_xi_to_x(ctx, op, 0)) {
+         return true;
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_context_init(TCGContext *s)
          total_args += n;
      }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    args_ct = g_malloc(sizeof(TCGArgConstraint) * total_args);
+             break;
 +    args_ct = g_new0(TCGArgConstraint, total_args);
      for(op = 0; op < NB_OPS; op++) {
          def = &tcg_op_defs[op];
@@ -XXX,XX +XXX,XX @@ static int get_constraint_priority(const TCGOpDef *def, int k)
      const TCGArgConstraint *arg_ct = &def->args_ct[k];
      int n;
 -    if (arg_ct->ct & TCG_CT_ALIAS) {
 +    if (arg_ct->oalias) {
          /* an alias is equivalent to a single register */
          n = 1;
      } else {
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
              /* Incomplete TCGTargetOpDef entry. */
              tcg_debug_assert(ct_str != NULL);
 -            def->args_ct[i].regs = 0;
 -            def->args_ct[i].ct = 0;
              while (*ct_str != '\0') {
                  switch(*ct_str) {
                  case '0' ... '9':
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
                          tcg_debug_assert(ct_str == tdefs->args_ct_str[i]);
                          tcg_debug_assert(oarg < def->nb_oargs);
                          tcg_debug_assert(def->args_ct[oarg].regs != 0);
 -                        /* TCG_CT_ALIAS is for the output arguments.
 -                           The input is tagged with TCG_CT_IALIAS. */
                          def->args_ct[i] = def->args_ct[oarg];
 -                        def->args_ct[oarg].ct |= TCG_CT_ALIAS;
 +                        /* The output sets oalias.  */
 +                        def->args_ct[oarg].oalias = true;
                          def->args_ct[oarg].alias_index = i;
 -                        def->args_ct[i].ct |= TCG_CT_IALIAS;
 +                        /* The input sets ialias. */
 +                        def->args_ct[i].ialias = true;
                          def->args_ct[i].alias_index = oarg;
                      }
                      ct_str++;
                      break;
                  case '&':
 -                    def->args_ct[i].ct |= TCG_CT_NEWREG;
 +                    def->args_ct[i].newreg = true;
                      ct_str++;
                      break;
                  case 'i':
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
                      set = *pset;
                      set &= ct->regs;
 -                    if (ct->ct & TCG_CT_IALIAS) {
 +                    if (ct->ialias) {
                          set &= op->output_pref[ct->alias_index];
                      }
                      /* If the combination is not possible, restart.  */
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
          }
-         i_preferred_regs = o_preferred_regs = 0;
+-        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
--        if (arg_ct->ct & TCG_CT_IALIAS) {
+-           and "sub r, 0, a => neg r, a" case.  */
-+        if (arg_ct->ialias) {
+-        switch (opc) {
-             o_preferred_regs = op->output_pref[arg_ct->alias_index];
+-        CASE_OP_32_64(shl):
-             if (ts->fixed_reg) {
+-        CASE_OP_32_64(shr):
-                 /* if fixed register, we must allocate a new register
+-        CASE_OP_32_64(sar):
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
+-        CASE_OP_32_64(rotl):
-                     reg = ts->reg;
+-        CASE_OP_32_64(rotr):
-                     for (k2 = 0 ; k2 < k ; k2++) {
+-            if (arg_is_const(op->args[1])
-                         i2 = def->args_ct[nb_oargs + k2].sort_index;
+-                && arg_info(op->args[1])->val == 0) {
--                        if ((def->args_ct[i2].ct & TCG_CT_IALIAS) &&
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
--                            reg == new_args[i2]) {
+-                continue;
-+                        if (def->args_ct[i2].ialias && reg == new_args[i2]) {
+-            }
-                             goto allocate_in_reg;
+-            break;
-                         }
+-        default:
-                     }
+-            break;
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
+-        }
-             /* ENV should not be modified.  */
+-
-             tcg_debug_assert(!ts->fixed_reg);
+         /* Simplify using known-zero bits. Currently only ops with a single
+            output argument are supported. */
--            if ((arg_ct->ct & TCG_CT_ALIAS)
+         z_mask = -1;
 -                && !const_args[arg_ct->alias_index]) {
 +            if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
                  reg = new_args[arg_ct->alias_index];
 -            } else if (arg_ct->ct & TCG_CT_NEWREG) {
 +            } else if (arg_ct->newreg) {
                  reg = tcg_reg_alloc(s, arg_ct->regs,
                                      i_allocated_regs | o_allocated_regs,
                                      op->output_pref[k], ts->indirect_base);
 --
 .25.1

-[PATCH 43/43] tcg/tci: Convert to tcg-constr.c.inc
+[PULL 43/56] tcg/optimize: Split out fold_masks
-This does require finishing the conversion to tcg_target_op_def.
+Move all of the known-zero optimizations into the per-opcode
-Remove quite a lot of ifdefs, since we can reference opcodes
+functions.  Use fold_masks when there is a possibility of the
-even if they are not implemented.
+result being determined, and simply set ctx->z_mask otherwise.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target-constr.h |  28 +++
+ tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
- tcg/tci/tcg-target.c.inc    | 360 ++++++++++++++----------------------
+file changed, 294 insertions(+), 251 deletions(-)
 files changed, 163 insertions(+), 225 deletions(-)
  create mode 100644 tcg/tci/tcg-target-constr.h
-diff --git a/tcg/tci/tcg-target-constr.h b/tcg/tci/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tcg/tci/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
 +/* SPDX-License-Identifier: GPL-2.0-or-later */
 +/*
 + * TCI target-specific operand constraints.
 + * Copyright (c) 2020 Linaro
 + */
 +
 +C_O0_I2(r, r)
 +C_O0_I2(r, ri)
 +C_O0_I2(r, S)
 +C_O0_I3(r, r, S)
 +C_O0_I3(r, S, S)
 +C_O0_I4(r, r, S, S)
 +C_O1_I1(r, L)
 +C_O1_I1(r, r)
 +C_O1_I2(r, 0, r)
 +C_O1_I2(r, L, L)
 +C_O1_I2(r, ri, ri)
 +C_O1_I2(r, r, r)
 +C_O1_I2(r, r, ri)
 +C_O2_I1(r, r, L)
 +C_O2_I2(r, r, L, L)
 +
 +#if TCG_TARGET_REG_BITS == 32
 +C_O0_I4(r, r, ri, ri)
 +C_O1_I4(r, r, r, ri, ri)
 +C_O2_I2(r, r, r, r)
 +C_O2_I4(r, r, r, r, r, r)
 +#endif
 diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
- /* Bitfield n...m (in 32 bit value). */
+     TCGTempSet temps_used;
- #define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m)
+     /* In flight values from optimization. */
--/* Macros used in tcg_target_op_defs. */
+-    uint64_t z_mask;
--#define R       "r"
++    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
--#define RI      "ri"
++    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
--#if TCG_TARGET_REG_BITS == 32
+     TCGType type;
--# define R64    "r", "r"
+ } OptContext;
--#else
--# define R64    "r"
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
--#endif
+     return false;
--#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
+ }
--# define L      "L", "L"
--# define S      "S", "S"
++static bool fold_masks(OptContext *ctx, TCGOp *op)
--#else
++{
--# define L      "L"
++    uint64_t a_mask = ctx->a_mask;
--# define S      "S"
++    uint64_t z_mask = ctx->z_mask;
--#endif
++
--
++    /*
--/* TODO: documentation. */
++     * 32-bit ops generate 32-bit results.  For the result-is-zero test
--static const TCGTargetOpDef tcg_target_op_defs[] = {
++     * below, we can ignore high bits, but for further optimizations we
--    { INDEX_op_exit_tb, { NULL } },
++     * need to record that the high bits contain garbage.
--    { INDEX_op_goto_tb, { NULL } },
++     */
--    { INDEX_op_br, { NULL } },
++    if (ctx->type == TCG_TYPE_I32) {
--
++        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
--    { INDEX_op_ld8u_i32, { R, R } },
++        a_mask &= MAKE_64BIT_MASK(0, 32);
--    { INDEX_op_ld8s_i32, { R, R } },
++        z_mask &= MAKE_64BIT_MASK(0, 32);
--    { INDEX_op_ld16u_i32, { R, R } },
++    }
--    { INDEX_op_ld16s_i32, { R, R } },
++
--    { INDEX_op_ld_i32, { R, R } },
++    if (z_mask == 0) {
--    { INDEX_op_st8_i32, { R, R } },
++        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
--    { INDEX_op_st16_i32, { R, R } },
++    }
--    { INDEX_op_st_i32, { R, R } },
++    if (a_mask == 0) {
--
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
--    { INDEX_op_add_i32, { R, RI, RI } },
++    }
--    { INDEX_op_sub_i32, { R, RI, RI } },
++    return false;
--    { INDEX_op_mul_i32, { R, RI, RI } },
++}
--#if TCG_TARGET_HAS_div_i32
++
--    { INDEX_op_div_i32, { R, R, R } },
+ /*
--    { INDEX_op_divu_i32, { R, R, R } },
+  * Convert @op to NOT, if NOT is supported by the host.
--    { INDEX_op_rem_i32, { R, R, R } },
+  * Return true if the conversion is successful, which will still
--    { INDEX_op_remu_i32, { R, R, R } },
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
--#elif TCG_TARGET_HAS_div2_i32
--    { INDEX_op_div2_i32, { R, R, "0", "1", R } },
+ static bool fold_and(OptContext *ctx, TCGOp *op)
--    { INDEX_op_divu2_i32, { R, R, "0", "1", R } },
+ {
--#endif
++    uint64_t z1, z2;
--    /* TODO: Does R, RI, RI result in faster code than R, R, RI?
++
--       If both operands are constants, we can optimize. */
+     if (fold_const2(ctx, op) ||
--    { INDEX_op_and_i32, { R, RI, RI } },
+         fold_xi_to_i(ctx, op, 0) ||
--#if TCG_TARGET_HAS_andc_i32
+         fold_xi_to_x(ctx, op, -1) ||
--    { INDEX_op_andc_i32, { R, RI, RI } },
+         fold_xx_to_x(ctx, op)) {
--#endif
+         return true;
--#if TCG_TARGET_HAS_eqv_i32
+     }
--    { INDEX_op_eqv_i32, { R, RI, RI } },
+-    return false;
--#endif
++
--#if TCG_TARGET_HAS_nand_i32
++    z1 = arg_info(op->args[1])->z_mask;
--    { INDEX_op_nand_i32, { R, RI, RI } },
++    z2 = arg_info(op->args[2])->z_mask;
--#endif
++    ctx->z_mask = z1 & z2;
--#if TCG_TARGET_HAS_nor_i32
++
--    { INDEX_op_nor_i32, { R, RI, RI } },
++    /*
--#endif
++     * Known-zeros does not imply known-ones.  Therefore unless
--    { INDEX_op_or_i32, { R, RI, RI } },
++     * arg2 is constant, we can't infer affected bits from it.
--#if TCG_TARGET_HAS_orc_i32
++     */
--    { INDEX_op_orc_i32, { R, RI, RI } },
++    if (arg_is_const(op->args[2])) {
--#endif
++        ctx->a_mask = z1 & ~z2;
--    { INDEX_op_xor_i32, { R, RI, RI } },
++    }
--    { INDEX_op_shl_i32, { R, RI, RI } },
++
--    { INDEX_op_shr_i32, { R, RI, RI } },
++    return fold_masks(ctx, op);
--    { INDEX_op_sar_i32, { R, RI, RI } },
+ }
--#if TCG_TARGET_HAS_rot_i32
--    { INDEX_op_rotl_i32, { R, RI, RI } },
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
--    { INDEX_op_rotr_i32, { R, RI, RI } },
+ {
--#endif
++    uint64_t z1;
--#if TCG_TARGET_HAS_deposit_i32
++
--    { INDEX_op_deposit_i32, { R, "0", R } },
+     if (fold_const2(ctx, op) ||
--#endif
+         fold_xx_to_i(ctx, op, 0) ||
--
+         fold_xi_to_x(ctx, op, 0) ||
--    { INDEX_op_brcond_i32, { R, RI } },
+         fold_ix_to_not(ctx, op, -1)) {
--
+         return true;
--    { INDEX_op_setcond_i32, { R, R, RI } },
+     }
--#if TCG_TARGET_REG_BITS == 64
+-    return false;
--    { INDEX_op_setcond_i64, { R, R, RI } },
++
--#endif /* TCG_TARGET_REG_BITS == 64 */
++    z1 = arg_info(op->args[1])->z_mask;
--
++
--#if TCG_TARGET_REG_BITS == 32
++    /*
--    /* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
++     * Known-zeros does not imply known-ones.  Therefore unless
--    { INDEX_op_add2_i32, { R, R, R, R, R, R } },
++     * arg2 is constant, we can't infer anything from it.
--    { INDEX_op_sub2_i32, { R, R, R, R, R, R } },
++     */
--    { INDEX_op_brcond2_i32, { R, R, RI, RI } },
++    if (arg_is_const(op->args[2])) {
--    { INDEX_op_mulu2_i32, { R, R, R, R } },
++        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
--    { INDEX_op_setcond2_i32, { R, R, R, RI, RI } },
++        ctx->a_mask = z1 & ~z2;
--#endif
++        z1 &= z2;
--
++    }
--#if TCG_TARGET_HAS_not_i32
++    ctx->z_mask = z1;
--    { INDEX_op_not_i32, { R, R } },
++
--#endif
++    return fold_masks(ctx, op);
--#if TCG_TARGET_HAS_neg_i32
+ }
--    { INDEX_op_neg_i32, { R, R } },
--#endif
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
--
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
--#if TCG_TARGET_REG_BITS == 64
--    { INDEX_op_ld8u_i64, { R, R } },
+ static bool fold_bswap(OptContext *ctx, TCGOp *op)
--    { INDEX_op_ld8s_i64, { R, R } },
+ {
--    { INDEX_op_ld16u_i64, { R, R } },
++    uint64_t z_mask, sign;
--    { INDEX_op_ld16s_i64, { R, R } },
++
--    { INDEX_op_ld32u_i64, { R, R } },
+     if (arg_is_const(op->args[1])) {
--    { INDEX_op_ld32s_i64, { R, R } },
+         uint64_t t = arg_info(op->args[1])->val;
--    { INDEX_op_ld_i64, { R, R } },
--
+         t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
--    { INDEX_op_st8_i64, { R, R } },
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
--    { INDEX_op_st16_i64, { R, R } },
+     }
--    { INDEX_op_st32_i64, { R, R } },
+-    return false;
--    { INDEX_op_st_i64, { R, R } },
++
--
++    z_mask = arg_info(op->args[1])->z_mask;
--    { INDEX_op_add_i64, { R, RI, RI } },
++    switch (op->opc) {
 -    { INDEX_op_sub_i64, { R, RI, RI } },
 -    { INDEX_op_mul_i64, { R, RI, RI } },
 -#if TCG_TARGET_HAS_div_i64
 -    { INDEX_op_div_i64, { R, R, R } },
 -    { INDEX_op_divu_i64, { R, R, R } },
 -    { INDEX_op_rem_i64, { R, R, R } },
 -    { INDEX_op_remu_i64, { R, R, R } },
 -#elif TCG_TARGET_HAS_div2_i64
 -    { INDEX_op_div2_i64, { R, R, "0", "1", R } },
 -    { INDEX_op_divu2_i64, { R, R, "0", "1", R } },
 -#endif
 -    { INDEX_op_and_i64, { R, RI, RI } },
 -#if TCG_TARGET_HAS_andc_i64
 -    { INDEX_op_andc_i64, { R, RI, RI } },
 -#endif
 -#if TCG_TARGET_HAS_eqv_i64
 -    { INDEX_op_eqv_i64, { R, RI, RI } },
 -#endif
 -#if TCG_TARGET_HAS_nand_i64
 -    { INDEX_op_nand_i64, { R, RI, RI } },
 -#endif
 -#if TCG_TARGET_HAS_nor_i64
 -    { INDEX_op_nor_i64, { R, RI, RI } },
 -#endif
 -    { INDEX_op_or_i64, { R, RI, RI } },
 -#if TCG_TARGET_HAS_orc_i64
 -    { INDEX_op_orc_i64, { R, RI, RI } },
 -#endif
 -    { INDEX_op_xor_i64, { R, RI, RI } },
 -    { INDEX_op_shl_i64, { R, RI, RI } },
 -    { INDEX_op_shr_i64, { R, RI, RI } },
 -    { INDEX_op_sar_i64, { R, RI, RI } },
 -#if TCG_TARGET_HAS_rot_i64
 -    { INDEX_op_rotl_i64, { R, RI, RI } },
 -    { INDEX_op_rotr_i64, { R, RI, RI } },
 -#endif
 -#if TCG_TARGET_HAS_deposit_i64
 -    { INDEX_op_deposit_i64, { R, "0", R } },
 -#endif
 -    { INDEX_op_brcond_i64, { R, RI } },
 -
 -#if TCG_TARGET_HAS_ext8s_i64
 -    { INDEX_op_ext8s_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_ext16s_i64
 -    { INDEX_op_ext16s_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_ext32s_i64
 -    { INDEX_op_ext32s_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_ext8u_i64
 -    { INDEX_op_ext8u_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_ext16u_i64
 -    { INDEX_op_ext16u_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_ext32u_i64
 -    { INDEX_op_ext32u_i64, { R, R } },
 -#endif
 -    { INDEX_op_ext_i32_i64, { R, R } },
 -    { INDEX_op_extu_i32_i64, { R, R } },
 -#if TCG_TARGET_HAS_bswap16_i64
 -    { INDEX_op_bswap16_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_bswap32_i64
 -    { INDEX_op_bswap32_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_bswap64_i64
 -    { INDEX_op_bswap64_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_not_i64
 -    { INDEX_op_not_i64, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_neg_i64
 -    { INDEX_op_neg_i64, { R, R } },
 -#endif
 -#endif /* TCG_TARGET_REG_BITS == 64 */
 -
 -    { INDEX_op_qemu_ld_i32, { R, L } },
 -    { INDEX_op_qemu_ld_i64, { R64, L } },
 -
 -    { INDEX_op_qemu_st_i32, { R, S } },
 -    { INDEX_op_qemu_st_i64, { R64, S } },
 -
 -#if TCG_TARGET_HAS_ext8s_i32
 -    { INDEX_op_ext8s_i32, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_ext16s_i32
 -    { INDEX_op_ext16s_i32, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_ext8u_i32
 -    { INDEX_op_ext8u_i32, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_ext16u_i32
 -    { INDEX_op_ext16u_i32, { R, R } },
 -#endif
 -
 -#if TCG_TARGET_HAS_bswap16_i32
 -    { INDEX_op_bswap16_i32, { R, R } },
 -#endif
 -#if TCG_TARGET_HAS_bswap32_i32
 -    { INDEX_op_bswap32_i32, { R, R } },
 -#endif
 -
 -    { INDEX_op_mb, { } },
 -    { -1 },
 -};
 +/* Define all constraint sets. */
 +#include "../tcg-constr.c.inc"
  static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
  {
 -    int i, n = ARRAY_SIZE(tcg_target_op_defs);
 +    switch (op) {
 +    case INDEX_op_ld8u_i32:
 +    case INDEX_op_ld8s_i32:
 +    case INDEX_op_ld16u_i32:
 +    case INDEX_op_ld16s_i32:
 +    case INDEX_op_ld_i32:
 +    case INDEX_op_ld8u_i64:
 +    case INDEX_op_ld8s_i64:
 +    case INDEX_op_ld16u_i64:
 +    case INDEX_op_ld16s_i64:
 +    case INDEX_op_ld32u_i64:
 +    case INDEX_op_ld32s_i64:
 +    case INDEX_op_ld_i64:
 +    case INDEX_op_not_i32:
 +    case INDEX_op_not_i64:
 +    case INDEX_op_neg_i32:
 +    case INDEX_op_neg_i64:
 +    case INDEX_op_ext8s_i32:
 +    case INDEX_op_ext8s_i64:
 +    case INDEX_op_ext16s_i32:
 +    case INDEX_op_ext16s_i64:
 +    case INDEX_op_ext8u_i32:
 +    case INDEX_op_ext8u_i64:
 +    case INDEX_op_ext16u_i32:
 +    case INDEX_op_ext16u_i64:
 +    case INDEX_op_ext32s_i64:
 +    case INDEX_op_ext32u_i64:
 +    case INDEX_op_ext_i32_i64:
 +    case INDEX_op_extu_i32_i64:
 +    case INDEX_op_bswap16_i32:
 +    case INDEX_op_bswap16_i64:
++        z_mask = bswap16(z_mask);
++        sign = INT16_MIN;
++        break;
 +    case INDEX_op_bswap32_i32:
 +    case INDEX_op_bswap32_i64:
++        z_mask = bswap32(z_mask);
++        sign = INT32_MIN;
++        break;
 +    case INDEX_op_bswap64_i64:
-+        return C_O1_I1(r, r);
++        z_mask = bswap64(z_mask);
++        sign = INT64_MIN;
--    for (i = 0; i < n; ++i) {
++        break;
--        if (tcg_target_op_defs[i].op == op) {
++    default:
--            return &tcg_target_op_defs[i];
++        g_assert_not_reached();
 +    }
 +
 +    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 +    case TCG_BSWAP_OZ:
 +        break;
 +    case TCG_BSWAP_OS:
 +        /* If the sign bit may be 1, force all the bits above to 1. */
 +        if (z_mask & sign) {
 +            z_mask |= sign;
 +        }
 +        break;
 +    default:
 +        /* The high bits are undefined: force all bits above the sign to 1. */
 +        z_mask |= sign << 1;
 +        break;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
  static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
      }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        z_mask = 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        z_mask = 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
 +
      return false;
  }
  static bool fold_ctpop(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        ctx->z_mask = 32 | 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        ctx->z_mask = 64 | 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
  }
  static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
          t1 = deposit64(t1, op->args[3], op->args[4], t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
 +
 +    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                            op->args[3], op->args[4],
 +                            arg_info(op->args[2])->z_mask);
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
          t = extract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask, sign;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8s):
 +        sign = INT8_MIN;
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16s):
 +        sign = INT16_MIN;
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_ext_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32s_i64:
 +        sign = INT32_MIN;
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    if (z_mask & sign) {
 +        z_mask |= sign;
 +    } else if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extu(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8u):
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16u):
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_extrl_i64_i32:
 +    case INDEX_op_extu_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32u_i64:
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    case INDEX_op_extrh_i64_i32:
 +        type_change = true;
 +        z_mask >>= 32;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    ctx->z_mask = z_mask;
 +    if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    return fold_masks(ctx, op);
  }
  static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
 +    ctx->z_mask = arg_info(op->args[3])->z_mask
 +                | arg_info(op->args[4])->z_mask;
 +
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
  static bool fold_neg(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (fold_const1(ctx, op)) {
          return true;
      }
 +
 +    /* Set to 1 all bits to the left of the rightmost.  */
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    ctx->z_mask = -(z_mask & -z_mask);
 +
      /*
       * Because of fold_sub_to_neg, we want to always return true,
       * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
 +    const TCGOpDef *def = &tcg_op_defs[op->opc];
 +    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 +    MemOp mop = get_memop(oi);
 +    int width = 8 * memop_size(mop);
 +
 +    if (!(mop & MO_SIGN) && width < 64) {
 +        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    }
 +
      /* Opcodes that touch guest memory stop the mb optimization.  */
      ctx->prev_mb = NULL;
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
 +
 +    ctx->z_mask = 1;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          op->opc = INDEX_op_setcond_i32;
          break;
      }
 +
 +    ctx->z_mask = 1;
      return false;
   do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 +    int64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
          t = sextract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0 && z_mask >= 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
 +
 +    if (arg_is_const(op->args[2])) {
 +        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
 +                                          arg_info(op->args[1])->z_mask,
 +                                          arg_info(op->args[2])->val);
 +        return fold_masks(ctx, op);
 +    }
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
      return fold_addsub2_i32(ctx, op, false);
  }
 +static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 +{
 +    /* We can't do any folding with a load, but we can record bits. */
 +    switch (op->opc) {
 +    CASE_OP_32_64(ld8u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        break;
 +    CASE_OP_32_64(ld16u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        break;
 +    case INDEX_op_ld32u_i64:
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
 +}
 +
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      }
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
 -        uint64_t z_mask, partmask, affected, tmp;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def;
          bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify using known-zero bits. Currently only ops with a single
 -           output argument is supported. */
 -        z_mask = -1;
 -        affected = -1;
 -        switch (opc) {
 -        CASE_OP_32_64(ext8s):
 -            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext8u):
 -            z_mask = 0xff;
 -            goto and_const;
 -        CASE_OP_32_64(ext16s):
 -            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext16u):
 -            z_mask = 0xffff;
 -            goto and_const;
 -        case INDEX_op_ext32s_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_ext32u_i64:
 -            z_mask = 0xffffffffU;
 -            goto and_const;
 -
 -        CASE_OP_32_64(and):
 -            z_mask = arg_info(op->args[2])->z_mask;
 -            if (arg_is_const(op->args[2])) {
 -        and_const:
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            z_mask = arg_info(op->args[1])->z_mask & z_mask;
 -            break;
 -
 -        case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_extu_i32_i64:
 -            /* We do not compute affected as it is a size changing op.  */
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(andc):
 -            /* Known-zeros does not imply known-ones.  Therefore unless
 -               op->args[2] is constant, we can't infer anything from it.  */
 -            if (arg_is_const(op->args[2])) {
 -                z_mask = ~arg_info(op->args[2])->z_mask;
 -                goto and_const;
 -            }
 -            /* But we certainly know nothing outside args[1] may be set. */
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        case INDEX_op_sar_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_sar_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_shr_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_shr_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_extrl_i64_i32:
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -        case INDEX_op_extrh_i64_i32:
 -            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
 -            break;
 -
 -        CASE_OP_32_64(shl):
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
 -                z_mask = arg_info(op->args[1])->z_mask << tmp;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(neg):
 -            /* Set to 1 all bits to the left of the rightmost.  */
 -            z_mask = -(arg_info(op->args[1])->z_mask
 -                       & -arg_info(op->args[1])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(deposit):
 -            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 -                               op->args[3], op->args[4],
 -                               arg_info(op->args[2])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(extract):
 -            z_mask = extract64(arg_info(op->args[1])->z_mask,
 -                               op->args[2], op->args[3]);
 -            if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -        CASE_OP_32_64(sextract):
 -            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 -                                op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(or):
 -        CASE_OP_32_64(xor):
 -            z_mask = arg_info(op->args[1])->z_mask
 -                   | arg_info(op->args[2])->z_mask;
 -            break;
 -
 -        case INDEX_op_clz_i32:
 -        case INDEX_op_ctz_i32:
 -            z_mask = arg_info(op->args[2])->z_mask | 31;
 -            break;
 -
 -        case INDEX_op_clz_i64:
 -        case INDEX_op_ctz_i64:
 -            z_mask = arg_info(op->args[2])->z_mask | 63;
 -            break;
 -
 -        case INDEX_op_ctpop_i32:
 -            z_mask = 32 | 31;
 -            break;
 -        case INDEX_op_ctpop_i64:
 -            z_mask = 64 | 63;
 -            break;
 -
 -        CASE_OP_32_64(setcond):
 -        case INDEX_op_setcond2_i32:
 -            z_mask = 1;
 -            break;
 -
 -        CASE_OP_32_64(movcond):
 -            z_mask = arg_info(op->args[3])->z_mask
 -                   | arg_info(op->args[4])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(ld8u):
 -            z_mask = 0xff;
 -            break;
 -        CASE_OP_32_64(ld16u):
 -            z_mask = 0xffff;
 -            break;
 -        case INDEX_op_ld32u_i64:
 -            z_mask = 0xffffffffu;
 -            break;
 -
 -        CASE_OP_32_64(qemu_ld):
 -            {
 -                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 -                MemOp mop = get_memop(oi);
 -                if (!(mop & MO_SIGN)) {
 -                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 -                }
 -            }
 -            break;
 -
 -        CASE_OP_32_64(bswap16):
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffff) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap16(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int16_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(16, 48);
 -                break;
 -            }
 -            break;
 -
 -        case INDEX_op_bswap32_i64:
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffffffffu) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap32(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int32_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(32, 32);
 -                break;
 -            }
 -            break;
 -
 -        default:
 -            break;
 -        }
-+    case INDEX_op_st8_i32:
+-
-+    case INDEX_op_st16_i32:
+-        /* 32-bit ops generate 32-bit results.  For the result is zero test
-+    case INDEX_op_st_i32:
+-           below, we can ignore high bits, but for further optimizations we
-+    case INDEX_op_st8_i64:
+-           need to record that the high bits contain garbage.  */
-+    case INDEX_op_st16_i64:
+-        partmask = z_mask;
-+    case INDEX_op_st32_i64:
+-        if (ctx.type == TCG_TYPE_I32) {
-+    case INDEX_op_st_i64:
+-            z_mask |= ~(tcg_target_ulong)0xffffffffu;
-+        return C_O0_I2(r, r);
+-            partmask &= 0xffffffffu;
-+
+-            affected &= 0xffffffffu;
-+    case INDEX_op_div_i32:
+-        }
-+    case INDEX_op_div_i64:
+-        ctx.z_mask = z_mask;
-+    case INDEX_op_divu_i32:
+-
-+    case INDEX_op_divu_i64:
+-        if (partmask == 0) {
-+    case INDEX_op_rem_i32:
+-            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-+    case INDEX_op_rem_i64:
+-            continue;
-+    case INDEX_op_remu_i32:
+-        }
-+    case INDEX_op_remu_i64:
+-        if (affected == 0) {
-+        return C_O1_I2(r, r, r);
+-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-+
+-            continue;
-+    case INDEX_op_add_i32:
+-        }
-+    case INDEX_op_add_i64:
++        /* Assume all bits affected, and no bits known zero. */
-+    case INDEX_op_sub_i32:
++        ctx.a_mask = -1;
-+    case INDEX_op_sub_i64:
++        ctx.z_mask = -1;
-+    case INDEX_op_mul_i32:
-+    case INDEX_op_mul_i64:
+         /*
-+    case INDEX_op_and_i32:
+          * Process each opcode.
-+    case INDEX_op_and_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+    case INDEX_op_andc_i32:
+         case INDEX_op_extrh_i64_i32:
-+    case INDEX_op_andc_i64:
+             done = fold_extu(&ctx, op);
-+    case INDEX_op_eqv_i32:
+             break;
-+    case INDEX_op_eqv_i64:
++        CASE_OP_32_64(ld8u):
-+    case INDEX_op_nand_i32:
++        CASE_OP_32_64(ld16u):
-+    case INDEX_op_nand_i64:
++        case INDEX_op_ld32u_i64:
-+    case INDEX_op_nor_i32:
++            done = fold_tcg_ld(&ctx, op);
-+    case INDEX_op_nor_i64:
++            break;
-+    case INDEX_op_or_i32:
+         case INDEX_op_mb:
-+    case INDEX_op_or_i64:
+             done = fold_mb(&ctx, op);
-+    case INDEX_op_orc_i32:
+             break;
 +    case INDEX_op_orc_i64:
 +    case INDEX_op_xor_i32:
 +    case INDEX_op_xor_i64:
 +    case INDEX_op_shl_i32:
 +    case INDEX_op_shl_i64:
 +    case INDEX_op_shr_i32:
 +    case INDEX_op_shr_i64:
 +    case INDEX_op_sar_i32:
 +    case INDEX_op_sar_i64:
 +    case INDEX_op_rotl_i32:
 +    case INDEX_op_rotl_i64:
 +    case INDEX_op_rotr_i32:
 +    case INDEX_op_rotr_i64:
 +        /* TODO: Does R, RI, RI result in faster code than R, R, RI? */
 +        return C_O1_I2(r, ri, ri);
 +
 +    case INDEX_op_deposit_i32:
 +    case INDEX_op_deposit_i64:
 +        return C_O1_I2(r, 0, r);
 +
 +    case INDEX_op_brcond_i32:
 +    case INDEX_op_brcond_i64:
 +        return C_O0_I2(r, ri);
 +
 +    case INDEX_op_setcond_i32:
 +    case INDEX_op_setcond_i64:
 +        return C_O1_I2(r, r, ri);
 +
 +#if TCG_TARGET_REG_BITS == 32
 +    /* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
 +    case INDEX_op_add2_i32:
 +    case INDEX_op_sub2_i32:
 +        return C_O2_I4(r, r, r, r, r, r);
 +    case INDEX_op_brcond2_i32:
 +        return C_O0_I4(r, r, ri, ri);
 +    case INDEX_op_mulu2_i32:
 +        return C_O2_I2(r, r, r, r);
 +    case INDEX_op_setcond2_i32:
 +        return C_O1_I4(r, r, r, ri, ri);
 +#endif
 +
 +    case INDEX_op_qemu_ld_i32:
 +        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
 +                ? C_O1_I1(r, L)
 +                : C_O1_I2(r, L, L));
 +    case INDEX_op_qemu_ld_i64:
 +        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
 +                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
 +                : C_O2_I2(r, r, L, L));
 +    case INDEX_op_qemu_st_i32:
 +        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
 +                ? C_O0_I2(r, S)
 +                : C_O0_I3(r, S, S));
 +    case INDEX_op_qemu_st_i64:
 +        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, S)
 +                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(r, r, S)
 +                : C_O0_I4(r, r, S, S));
 +
 +    default:
 +        return NULL;
      }
 -    return NULL;
  }
  static const int tcg_target_reg_alloc_order[] = {
 --
 .25.1

-[PATCH 13/43] tcg: Consolidate 3 bits into enum TCGTempKind
+[PULL 44/56] tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
-The temp_fixed, temp_global, temp_local bits are all related.
+Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
-Combine them into a single enumeration.
+and muls2_i64.
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h |  20 +++++---
+ tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
- tcg/optimize.c    |   8 +--
+file changed, 35 insertions(+), 9 deletions(-)
  tcg/tcg.c         | 122 ++++++++++++++++++++++++++++------------------
 files changed, 90 insertions(+), 60 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
-index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
-+++ b/include/tcg/tcg.h
-@@ -XXX,XX +XXX,XX @@ typedef enum TCGTempVal {
-     TEMP_VAL_CONST,
- } TCGTempVal;
-+typedef enum TCGTempKind {
-+    /* Temp is dead at the end of all basic blocks. */
-+    TEMP_NORMAL,
-+    /* Temp is saved across basic blocks but dead at the end of TBs. */
-+    TEMP_LOCAL,
-+    /* Temp is saved across both basic blocks and translation blocks. */
-+    TEMP_GLOBAL,
-+    /* Temp is in a fixed register. */
-+    TEMP_FIXED,
-+} TCGTempKind;
-+
- typedef struct TCGTemp {
-     TCGReg reg:8;
-     TCGTempVal val_type:8;
-     TCGType base_type:8;
-     TCGType type:8;
--    unsigned int fixed_reg:1;
-+    TCGTempKind kind:3;
-     unsigned int indirect_reg:1;
-     unsigned int indirect_base:1;
-     unsigned int mem_coherent:1;
-     unsigned int mem_allocated:1;
--    /* If true, the temp is saved across both basic blocks and
--       translation blocks.  */
--    unsigned int temp_global:1;
--    /* If true, the temp is saved across basic blocks but dead
--       at the end of translation blocks.  If false, the temp is
--       dead at the end of basic blocks.  */
--    unsigned int temp_local:1;
-     unsigned int temp_allocated:1;
-     tcg_target_long val;
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
-     TCGTemp *i;
+     return false;
      /* If this is already a global, we can't do better. */
 -    if (ts->temp_global) {
 +    if (ts->kind >= TEMP_GLOBAL) {
          return ts;
      }
      /* Search for a global first. */
      for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
 -        if (i->temp_global) {
 +        if (i->kind >= TEMP_GLOBAL) {
              return i;
          }
      }
      /* If it is a temp, search for a temp local. */
 -    if (!ts->temp_local) {
 +    if (ts->kind == TEMP_NORMAL) {
          for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
 -            if (ts->temp_local) {
 +            if (i->kind >= TEMP_LOCAL) {
                  return i;
              }
          }
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static inline TCGTemp *tcg_global_alloc(TCGContext *s)
      tcg_debug_assert(s->nb_globals == s->nb_temps);
      s->nb_globals++;
      ts = tcg_temp_alloc(s);
 -    ts->temp_global = 1;
 +    ts->kind = TEMP_GLOBAL;
      return ts;
  }
-@@ -XXX,XX +XXX,XX @@ static TCGTemp *tcg_global_reg_new_internal(TCGContext *s, TCGType type,
-     ts = tcg_global_alloc(s);
+-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
-     ts->base_type = type;
++static bool fold_multiply2(OptContext *ctx, TCGOp *op)
      ts->type = type;
 -    ts->fixed_reg = 1;
 +    ts->kind = TEMP_FIXED;
      ts->reg = reg;
      ts->name = name;
      tcg_regset_set_reg(s->reserved_regs, reg);
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
      bigendian = 1;
  #endif
 -    if (!base_ts->fixed_reg) {
 +    if (base_ts->kind != TEMP_FIXED) {
          /* We do not support double-indirect registers.  */
          tcg_debug_assert(!base_ts->indirect_reg);
          base_ts->indirect_base = 1;
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
  TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
  {
-     TCGContext *s = tcg_ctx;
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-+    TCGTempKind kind = temp_local ? TEMP_LOCAL : TEMP_NORMAL;
+-        uint32_t a = arg_info(op->args[2])->val;
-     TCGTemp *ts;
+-        uint32_t b = arg_info(op->args[3])->val;
-     int idx, k;
+-        uint64_t r = (uint64_t)a * b;
++        uint64_t a = arg_info(op->args[2])->val;
-@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
++        uint64_t b = arg_info(op->args[3])->val;
-         ts = &s->temps[idx];
++        uint64_t h, l;
-         ts->temp_allocated = 1;
+         TCGArg rl, rh;
-         tcg_debug_assert(ts->base_type == type);
+-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
--        tcg_debug_assert(ts->temp_local == temp_local);
++        TCGOp *op2;
 +        tcg_debug_assert(ts->kind == kind);
      } else {
          ts = tcg_temp_alloc(s);
          if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
              ts->base_type = type;
              ts->type = TCG_TYPE_I32;
              ts->temp_allocated = 1;
 -            ts->temp_local = temp_local;
 +            ts->kind = kind;
              tcg_debug_assert(ts2 == ts + 1);
              ts2->base_type = TCG_TYPE_I64;
              ts2->type = TCG_TYPE_I32;
              ts2->temp_allocated = 1;
 -            ts2->temp_local = temp_local;
 +            ts2->kind = kind;
          } else {
              ts->base_type = type;
              ts->type = type;
              ts->temp_allocated = 1;
 -            ts->temp_local = temp_local;
 +            ts->kind = kind;
          }
      }
@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
      }
  #endif
 -    tcg_debug_assert(ts->temp_global == 0);
 +    tcg_debug_assert(ts->kind < TEMP_GLOBAL);
      tcg_debug_assert(ts->temp_allocated != 0);
      ts->temp_allocated = 0;
      idx = temp_idx(ts);
 -    k = ts->base_type + (ts->temp_local ? TCG_TYPE_COUNT : 0);
 +    k = ts->base_type + (ts->kind == TEMP_NORMAL ? 0 : TCG_TYPE_COUNT);
      set_bit(idx, s->free_temps[k].l);
  }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
  static void tcg_reg_alloc_start(TCGContext *s)
  {
      int i, n;
 -    TCGTemp *ts;
 -    for (i = 0, n = s->nb_globals; i < n; i++) {
 -        ts = &s->temps[i];
 -        ts->val_type = (ts->fixed_reg ? TEMP_VAL_REG : TEMP_VAL_MEM);
 -    }
 -    for (n = s->nb_temps; i < n; i++) {
 -        ts = &s->temps[i];
 -        ts->val_type = (ts->temp_local ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
 -        ts->mem_allocated = 0;
 -        ts->fixed_reg = 0;
 +    for (i = 0, n = s->nb_temps; i < n; i++) {
 +        TCGTemp *ts = &s->temps[i];
 +        TCGTempVal val = TEMP_VAL_MEM;
 +
-+        switch (ts->kind) {
++        switch (op->opc) {
-+        case TEMP_FIXED:
++        case INDEX_op_mulu2_i32:
-+            val = TEMP_VAL_REG;
++            l = (uint64_t)(uint32_t)a * (uint32_t)b;
 +            h = (int32_t)(l >> 32);
 +            l = (int32_t)l;
 +            break;
-+        case TEMP_GLOBAL:
++        case INDEX_op_muls2_i32:
 +            l = (int64_t)(int32_t)a * (int32_t)b;
 +            h = l >> 32;
 +            l = (int32_t)l;
 +            break;
-+        case TEMP_NORMAL:
++        case INDEX_op_mulu2_i64:
-+            val = TEMP_VAL_DEAD;
++            mulu64(&l, &h, a, b);
-+            /* fall through */
++            break;
-+        case TEMP_LOCAL:
++        case INDEX_op_muls2_i64:
-+            ts->mem_allocated = 0;
++            muls64(&l, &h, a, b);
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
-+        ts->val_type = val;
          rl = op->args[0];
          rh = op->args[1];
 -        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
 -        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
 +
 +        /* The proper opcode is supplied by tcg_opt_gen_mov. */
 +        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
 +
 +        tcg_opt_gen_movi(ctx, op, rl, l);
 +        tcg_opt_gen_movi(ctx, op2, rh, h);
          return true;
      }
+     return false;
-     memset(s->reg_to_temp, 0, sizeof(s->reg_to_temp));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ static char *tcg_get_arg_str_ptr(TCGContext *s, char *buf, int buf_size,
+         CASE_OP_32_64(muluh):
- {
+             done = fold_mul_highpart(&ctx, op);
-     int idx = temp_idx(ts);
+             break;
+-        case INDEX_op_mulu2_i32:
--    if (ts->temp_global) {
+-            done = fold_mulu2_i32(&ctx, op);
-+    switch (ts->kind) {
++        CASE_OP_32_64(muls2):
-+    case TEMP_FIXED:
++        CASE_OP_32_64(mulu2):
-+    case TEMP_GLOBAL:
++            done = fold_multiply2(&ctx, op);
-         pstrcpy(buf, buf_size, ts->name);
+             break;
--    } else if (ts->temp_local) {
+         CASE_OP_32_64(nand):
-+        break;
+             done = fold_nand(&ctx, op);
 +    case TEMP_LOCAL:
          snprintf(buf, buf_size, "loc%d", idx - s->nb_globals);
 -    } else {
 +        break;
 +    case TEMP_NORMAL:
          snprintf(buf, buf_size, "tmp%d", idx - s->nb_globals);
 +        break;
      }
      return buf;
  }
@@ -XXX,XX +XXX,XX @@ static void la_bb_end(TCGContext *s, int ng, int nt)
  {
      int i;
 -    for (i = 0; i < ng; ++i) {
 -        s->temps[i].state = TS_DEAD | TS_MEM;
 -        la_reset_pref(&s->temps[i]);
 -    }
 -    for (i = ng; i < nt; ++i) {
 -        s->temps[i].state = (s->temps[i].temp_local
 -                             ? TS_DEAD | TS_MEM
 -                             : TS_DEAD);
 -        la_reset_pref(&s->temps[i]);
 +    for (i = 0; i < nt; ++i) {
 +        TCGTemp *ts = &s->temps[i];
 +        int state;
 +
 +        switch (ts->kind) {
 +        case TEMP_FIXED:
 +        case TEMP_GLOBAL:
 +        case TEMP_LOCAL:
 +            state = TS_DEAD | TS_MEM;
 +            break;
 +        case TEMP_NORMAL:
 +            state = TS_DEAD;
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
 +        ts->state = state;
 +        la_reset_pref(ts);
      }
  }
@@ -XXX,XX +XXX,XX @@ static void check_regs(TCGContext *s)
      }
      for (k = 0; k < s->nb_temps; k++) {
          ts = &s->temps[k];
 -        if (ts->val_type == TEMP_VAL_REG && !ts->fixed_reg
 +        if (ts->val_type == TEMP_VAL_REG
 +            && ts->kind != TEMP_FIXED
              && s->reg_to_temp[ts->reg] != ts) {
              printf("Inconsistency for temp %s:\n",
                     tcg_get_arg_str_ptr(s, buf, sizeof(buf), ts));
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
     mark it free; otherwise mark it dead.  */
  static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
  {
 -    if (ts->fixed_reg) {
 +    if (ts->kind == TEMP_FIXED) {
          return;
      }
      if (ts->val_type == TEMP_VAL_REG) {
          s->reg_to_temp[ts->reg] = NULL;
      }
      ts->val_type = (free_or_dead < 0
 -                    || ts->temp_local
 -                    || ts->temp_global
 +                    || ts->kind != TEMP_NORMAL
                      ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
  }
@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
  static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
                        TCGRegSet preferred_regs, int free_or_dead)
  {
 -    if (ts->fixed_reg) {
 +    if (ts->kind == TEMP_FIXED) {
          return;
      }
      if (!ts->mem_coherent) {
@@ -XXX,XX +XXX,XX @@ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs)
  {
      /* The liveness analysis already ensures that globals are back
         in memory. Keep an tcg_debug_assert for safety. */
 -    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg);
 +    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM
 +                     || ts->kind == TEMP_FIXED);
  }
  /* save globals to their canonical location and assume they can be
@@ -XXX,XX +XXX,XX @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs)
      for (i = 0, n = s->nb_globals; i < n; i++) {
          TCGTemp *ts = &s->temps[i];
          tcg_debug_assert(ts->val_type != TEMP_VAL_REG
 -                         || ts->fixed_reg
 +                         || ts->kind == TEMP_FIXED
                           || ts->mem_coherent);
      }
  }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
      for (i = s->nb_globals; i < s->nb_temps; i++) {
          TCGTemp *ts = &s->temps[i];
 -        if (ts->temp_local) {
 +        if (ts->kind == TEMP_LOCAL) {
              temp_save(s, ts, allocated_regs);
          } else {
              /* The liveness analysis already ensures that temps are dead.
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
                                    TCGRegSet preferred_regs)
  {
      /* ENV should not be modified.  */
 -    tcg_debug_assert(!ots->fixed_reg);
 +    tcg_debug_assert(ots->kind != TEMP_FIXED);
      /* The movi is not explicitly generated here.  */
      if (ots->val_type == TEMP_VAL_REG) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
      ts = arg_temp(op->args[1]);
      /* ENV should not be modified.  */
 -    tcg_debug_assert(!ots->fixed_reg);
 +    tcg_debug_assert(ots->kind != TEMP_FIXED);
      /* Note that otype != itype for no-op truncation.  */
      otype = ots->type;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
          }
          temp_dead(s, ots);
      } else {
 -        if (IS_DEAD_ARG(1) && !ts->fixed_reg) {
 +        if (IS_DEAD_ARG(1) && ts->kind != TEMP_FIXED) {
              /* the mov can be suppressed */
              if (ots->val_type == TEMP_VAL_REG) {
                  s->reg_to_temp[ots->reg] = NULL;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
                   * Store the source register into the destination slot
                   * and leave the destination temp as TEMP_VAL_MEM.
                   */
 -                assert(!ots->fixed_reg);
 +                assert(ots->kind != TEMP_FIXED);
                  if (!ts->mem_allocated) {
                      temp_allocate_frame(s, ots);
                  }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
      its = arg_temp(op->args[1]);
      /* ENV should not be modified.  */
 -    tcg_debug_assert(!ots->fixed_reg);
 +    tcg_debug_assert(ots->kind != TEMP_FIXED);
      itype = its->type;
      vece = TCGOP_VECE(op);
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
          i_preferred_regs = o_preferred_regs = 0;
          if (arg_ct->ialias) {
              o_preferred_regs = op->output_pref[arg_ct->alias_index];
 -            if (ts->fixed_reg) {
 +            if (ts->kind == TEMP_FIXED) {
                  /* if fixed register, we must allocate a new register
                     if the alias is not the same register */
                  if (arg != op->args[arg_ct->alias_index]) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
              ts = arg_temp(arg);
              /* ENV should not be modified.  */
 -            tcg_debug_assert(!ts->fixed_reg);
 +            tcg_debug_assert(ts->kind != TEMP_FIXED);
              if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
                  reg = new_args[arg_ct->alias_index];
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
          ts = arg_temp(op->args[i]);
          /* ENV should not be modified.  */
 -        tcg_debug_assert(!ts->fixed_reg);
 +        tcg_debug_assert(ts->kind != TEMP_FIXED);
          if (NEED_SYNC_ARG(i)) {
              temp_sync(s, ts, o_allocated_regs, 0, IS_DEAD_ARG(i));
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
          ts = arg_temp(arg);
          /* ENV should not be modified.  */
 -        tcg_debug_assert(!ts->fixed_reg);
 +        tcg_debug_assert(ts->kind != TEMP_FIXED);
          reg = tcg_target_call_oarg_regs[i];
          tcg_debug_assert(s->reg_to_temp[reg] == NULL);
 --
 .25.1

-[PATCH 16/43] tcg: Rename struct tcg_temp_info to TempOptInfo
+[PULL 45/56] tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
-Fix this name vs our coding style.
+Rename to fold_addsub2.
 Use Int128 to implement the wider operation.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/optimize.c | 32 ++++++++++++++++----------------
+ tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
-file changed, 16 insertions(+), 16 deletions(-)
+file changed, 44 insertions(+), 21 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
 @@ -XXX,XX +XXX,XX @@
-         glue(glue(case INDEX_op_, x), _i64):    \
+  */
-         glue(glue(case INDEX_op_, x), _vec)
+ #include "qemu/osdep.h"
--struct tcg_temp_info {
++#include "qemu/int128.h"
-+typedef struct TempOptInfo {
+ #include "tcg/tcg-op.h"
-     bool is_const;
+ #include "tcg-internal.h"
-     TCGTemp *prev_copy;
-     TCGTemp *next_copy;
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
-     tcg_target_ulong val;
+     return false;
-     tcg_target_ulong mask;
+ }
--};
-+} TempOptInfo;
+-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
++static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 -static inline struct tcg_temp_info *ts_info(TCGTemp *ts)
 +static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
-     return ts->state_ptr;
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
          arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
 -        uint32_t al = arg_info(op->args[2])->val;
 -        uint32_t ah = arg_info(op->args[3])->val;
 -        uint32_t bl = arg_info(op->args[4])->val;
 -        uint32_t bh = arg_info(op->args[5])->val;
 -        uint64_t a = ((uint64_t)ah << 32) | al;
 -        uint64_t b = ((uint64_t)bh << 32) | bl;
 +        uint64_t al = arg_info(op->args[2])->val;
 +        uint64_t ah = arg_info(op->args[3])->val;
 +        uint64_t bl = arg_info(op->args[4])->val;
 +        uint64_t bh = arg_info(op->args[5])->val;
          TCGArg rl, rh;
 -        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 +        TCGOp *op2;
 -        if (add) {
 -            a += b;
 +        if (ctx->type == TCG_TYPE_I32) {
 +            uint64_t a = deposit64(al, 32, 32, ah);
 +            uint64_t b = deposit64(bl, 32, 32, bh);
 +
 +            if (add) {
 +                a += b;
 +            } else {
 +                a -= b;
 +            }
 +
 +            al = sextract64(a, 0, 32);
 +            ah = sextract64(a, 32, 32);
          } else {
 -            a -= b;
 +            Int128 a = int128_make128(al, ah);
 +            Int128 b = int128_make128(bl, bh);
 +
 +            if (add) {
 +                a = int128_add(a, b);
 +            } else {
 +                a = int128_sub(a, b);
 +            }
 +
 +            al = int128_getlo(a);
 +            ah = int128_gethi(a);
          }
          rl = op->args[0];
          rh = op->args[1];
 -        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
 -        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
 +
 +        /* The proper opcode is supplied by tcg_opt_gen_mov. */
 +        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
 +
 +        tcg_opt_gen_movi(ctx, op, rl, al);
 +        tcg_opt_gen_movi(ctx, op2, rh, ah);
          return true;
      }
      return false;
  }
--static inline struct tcg_temp_info *arg_info(TCGArg arg)
+-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
-+static inline TempOptInfo *arg_info(TCGArg arg)
++static bool fold_add2(OptContext *ctx, TCGOp *op)
  {
-     return ts_info(arg_temp(arg));
+-    return fold_addsub2_i32(ctx, op, true);
 +    return fold_addsub2(ctx, op, true);
  }
-@@ -XXX,XX +XXX,XX @@ static inline bool ts_is_copy(TCGTemp *ts)
- /* Reset TEMP's state, possibly removing the temp for the list of copies.  */
+ static bool fold_and(OptContext *ctx, TCGOp *op)
- static void reset_ts(TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
      return false;
  }
 -static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 +static bool fold_sub2(OptContext *ctx, TCGOp *op)
  {
--    struct tcg_temp_info *ti = ts_info(ts);
+-    return fold_addsub2_i32(ctx, op, false);
--    struct tcg_temp_info *pi = ts_info(ti->prev_copy);
++    return fold_addsub2(ctx, op, false);
 -    struct tcg_temp_info *ni = ts_info(ti->next_copy);
 +    TempOptInfo *ti = ts_info(ts);
 +    TempOptInfo *pi = ts_info(ti->prev_copy);
 +    TempOptInfo *ni = ts_info(ti->next_copy);
      ni->prev_copy = ti->prev_copy;
      pi->next_copy = ti->next_copy;
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
  }
- /* Initialize and activate a temporary.  */
+ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 -static void init_ts_info(struct tcg_temp_info *infos,
 +static void init_ts_info(TempOptInfo *infos,
                           TCGTempSet *temps_used, TCGTemp *ts)
  {
      size_t idx = temp_idx(ts);
      if (!test_bit(idx, temps_used->l)) {
 -        struct tcg_temp_info *ti = &infos[idx];
 +        TempOptInfo *ti = &infos[idx];
          ts->state_ptr = ti;
          ti->next_copy = ts;
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(struct tcg_temp_info *infos,
      }
  }
 -static void init_arg_info(struct tcg_temp_info *infos,
 +static void init_arg_info(TempOptInfo *infos,
                            TCGTempSet *temps_used, TCGArg arg)
  {
      init_ts_info(infos, temps_used, arg_temp(arg));
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val)
      const TCGOpDef *def;
      TCGOpcode new_op;
      tcg_target_ulong mask;
 -    struct tcg_temp_info *di = arg_info(dst);
 +    TempOptInfo *di = arg_info(dst);
      def = &tcg_op_defs[op->opc];
      if (def->flags & TCG_OPF_VECTOR) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
      const TCGOpDef *def;
 -    struct tcg_temp_info *di;
 -    struct tcg_temp_info *si;
 +    TempOptInfo *di;
 +    TempOptInfo *si;
      tcg_target_ulong mask;
      TCGOpcode new_op;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      di->mask = mask;
      if (src_ts->type == dst_ts->type) {
 -        struct tcg_temp_info *ni = ts_info(si->next_copy);
 +        TempOptInfo *ni = ts_info(si->next_copy);
          di->next_copy = si->next_copy;
          di->prev_copy = src_ts;
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- {
+         CASE_OP_32_64_VEC(add):
-     int nb_temps, nb_globals;
+             done = fold_add(&ctx, op);
-     TCGOp *op, *op_next, *prev_mb = NULL;
+             break;
--    struct tcg_temp_info *infos;
+-        case INDEX_op_add2_i32:
-+    TempOptInfo *infos;
+-            done = fold_add2_i32(&ctx, op);
-     TCGTempSet temps_used;
++        CASE_OP_32_64(add2):
++            done = fold_add2(&ctx, op);
-     /* Array VALS has an element for each temp.
+             break;
          CASE_OP_32_64_VEC(and):
              done = fold_and(&ctx, op);
 @@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     nb_temps = s->nb_temps;
+         CASE_OP_32_64_VEC(sub):
-     nb_globals = s->nb_globals;
+             done = fold_sub(&ctx, op);
-     bitmap_zero(temps_used.l, nb_temps);
+             break;
--    infos = tcg_malloc(sizeof(struct tcg_temp_info) * nb_temps);
+-        case INDEX_op_sub2_i32:
-+    infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);
+-            done = fold_sub2_i32(&ctx, op);
++        CASE_OP_32_64(sub2):
-     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
++            done = fold_sub2(&ctx, op);
-         tcg_target_ulong mask, partmask, affected;
+             break;
          CASE_OP_32_64_VEC(xor):
              done = fold_xor(&ctx, op);
 --
 .25.1

-[PATCH 35/43] tcg/i386: Convert to tcg-constr.c.inc
+[PULL 46/56] tcg/optimize: Sink commutative operand swapping into fold functions
+Most of these are handled by creating a fold_const2_commutative
+to handle all of the binary operators.  The rest were already
+handled on a case-by-case basis in the switch, and have their
+own fold function in which to place the call.
+We now have only one major switch on TCGOpcode.
+Introduce NO_DEST and a block comment for swap_commutative in
+order to make the handling of brcond and movcond opcodes cleaner.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/i386/tcg-target-constr.h |  55 +++++++++++
+ tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
- tcg/i386/tcg-target.c.inc    | 187 +++++++++++++----------------------
+file changed, 70 insertions(+), 72 deletions(-)
-files changed, 121 insertions(+), 121 deletions(-)
- create mode 100644 tcg/i386/tcg-target-constr.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
-diff --git a/tcg/i386/tcg-target-constr.h b/tcg/i386/tcg-target-constr.h
+--- a/tcg/optimize.c
-new file mode 100644
++++ b/tcg/optimize.c
-index XXXXXXX..XXXXXXX
+@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
---- /dev/null
+     return -1;
-+++ b/tcg/i386/tcg-target-constr.h
+ }
-@@ -XXX,XX +XXX,XX @@
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
++/**
-+/*
++ * swap_commutative:
-+ * i386 target-specific operand constaints.
++ * @dest: TCGArg of the destination argument, or NO_DEST.
-+ * Copyright (c) 2020 Linaro
++ * @p1: first paired argument
 + * @p2: second paired argument
 + *
 + * If *@p1 is a constant and *@p2 is not, swap.
 + * If *@p2 matches @dest, swap.
 + * Return true if a swap was performed.
 + */
 +
-+C_O0_I1(r)
++#define NO_DEST  temp_arg(NULL)
 +
-+C_O0_I2(qi, r)
+ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
-+C_O0_I2(ri, r)
+ {
-+C_O0_I2(re, r)
+     TCGArg a1 = *p1, a2 = *p2;
-+C_O0_I2(r, re)
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-+C_O0_I2(L, L)
+     return false;
 +C_O0_I2(x, r)
 +
 +C_O0_I3(L, L, L)
 +
 +C_O0_I4(L, L, L, L)
 +C_O0_I4(r, r, ri, ri)
 +
 +C_O1_I1(r, 0)
 +C_O1_I1(r, q)
 +C_O1_I1(r, r)
 +C_O1_I1(r, L)
 +C_O1_I1(x, r)
 +C_O1_I1(x, x)
 +
 +C_O1_I2(r, r, re)
 +C_O1_I2(r, 0, r)
 +C_O1_I2(r, 0, re)
 +C_O1_I2(r, 0, reZ)
 +C_O1_I2(r, 0, rI)
 +C_O1_I2(r, 0, ri)
 +C_O1_I2(r, 0, ci)
 +C_O1_I2(r, r, ri)
 +C_O1_I2(Q, 0, Q)
 +C_O1_I2(q, r, re)
 +C_O1_I2(r, L, L)
 +C_O1_I2(x, x, x)
 +C_N1_I2(r, r, r)
 +C_N1_I2(r, r, rW)
 +
 +C_O1_I3(x, x, x, x)
 +
 +C_O1_I4(r, r, re, r, 0)
 +C_O1_I4(r, r, r, ri, ri)
 +
 +C_O2_I1(r, r, L)
 +
 +C_O2_I2(r, r, L, L)
 +C_O2_I2(a, d, a, r)
 +
 +C_O2_I3(a, d, 0, 1, r)
 +
 +C_O2_I4(r, r, 0, 1, re, re)
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
      }
  }
-+/* Define all constraint sets. */
++static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
-+#include "../tcg-constr.c.inc"
++{
-+
++    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
- static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
++    return fold_const2(ctx, op);
- {
++}
--    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
++
--    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
+ static bool fold_masks(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
+ {
--    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
+     uint64_t a_mask = ctx->a_mask;
--    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
--    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
+ static bool fold_add(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
+ {
--    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
+-    if (fold_const2(ctx, op) ||
--    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
++    if (fold_const2_commutative(ctx, op) ||
--    static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
+         fold_xi_to_x(ctx, op, 0)) {
--    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
+         return true;
--    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
+     }
--    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
+@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
--    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
--    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
+ static bool fold_add2(OptContext *ctx, TCGOp *op)
--    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
+ {
--    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
++    /* Note that the high and low parts may be independently swapped. */
--    static const TCGTargetOpDef r_r_L_L
++    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
--        = { .args_ct_str = { "r", "r", "L", "L" } };
++    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
--    static const TCGTargetOpDef L_L_L_L
++
--        = { .args_ct_str = { "L", "L", "L", "L" } };
+     return fold_addsub2(ctx, op, true);
--    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
+ }
--    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
--    static const TCGTargetOpDef x_x_x_x
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
--        = { .args_ct_str = { "x", "x", "x", "x" } };
+ {
--    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
+     uint64_t z1, z2;
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
 +        op->args[2] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
  static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[4];
 -    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      TCGArg label = op->args[5];
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[0], &op->args[2])) {
 +        op->args[4] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      if (i >= 0) {
          goto do_brcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +    /*
 +     * Canonicalize the "false" input reg to match the destination reg so
 +     * that the tcg backend can implement a "move if true" operation.
 +     */
 +    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 +        op->args[5] = cond = tcg_invert_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  {
 +    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 +
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
          uint64_t a = arg_info(op->args[2])->val;
          uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 +        op->args[3] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
  static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[1], &op->args[3])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
      if (i >= 0) {
          goto do_setcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* For commutative operations make constant second argument */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(add):
 -        CASE_OP_32_64_VEC(mul):
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(eqv):
 -        CASE_OP_32_64(nand):
 -        CASE_OP_32_64(nor):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 -            break;
 -        CASE_OP_32_64(brcond):
 -            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
 -                op->args[2] = tcg_swap_cond(op->args[2]);
 -            }
 -            break;
 -        CASE_OP_32_64(setcond):
 -            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 -                op->args[3] = tcg_swap_cond(op->args[3]);
 -            }
 -            break;
 -        CASE_OP_32_64(movcond):
 -            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            /* For movcond, we canonicalize the "false" input reg to match
 -               the destination reg so that the tcg backend can implement
 -               a "move if true" operation.  */
 -            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 -                op->args[5] = tcg_invert_cond(op->args[5]);
 -            }
 -            break;
 -        CASE_OP_32_64(add2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 -            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 -            break;
 -        CASE_OP_32_64(mulu2):
 -        CASE_OP_32_64(muls2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 -            break;
 -        case INDEX_op_brcond2_i32:
 -            if (swap_commutative2(&op->args[0], &op->args[2])) {
 -                op->args[4] = tcg_swap_cond(op->args[4]);
 -            }
 -            break;
 -        case INDEX_op_setcond2_i32:
 -            if (swap_commutative2(&op->args[1], &op->args[3])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
-     switch (op) {
+         /* Assume all bits affected, and no bits known zero. */
-     case INDEX_op_goto_ptr:
+         ctx.a_mask = -1;
--        return &r;
+         ctx.z_mask = -1;
 +        return C_O0_I1(r);
      case INDEX_op_ld8u_i32:
      case INDEX_op_ld8u_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_ld32u_i64:
      case INDEX_op_ld32s_i64:
      case INDEX_op_ld_i64:
 -        return &r_r;
 +        return C_O1_I1(r, r);
      case INDEX_op_st8_i32:
      case INDEX_op_st8_i64:
 -        return &qi_r;
 +        return C_O0_I2(qi, r);
 +
      case INDEX_op_st16_i32:
      case INDEX_op_st16_i64:
      case INDEX_op_st_i32:
      case INDEX_op_st32_i64:
 -        return &ri_r;
 +        return C_O0_I2(ri, r);
 +
      case INDEX_op_st_i64:
 -        return &re_r;
 +        return C_O0_I2(re, r);
      case INDEX_op_add_i32:
      case INDEX_op_add_i64:
 -        return &r_r_re;
 +        return C_O1_I2(r, r, re);
 +
      case INDEX_op_sub_i32:
      case INDEX_op_sub_i64:
      case INDEX_op_mul_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_or_i64:
      case INDEX_op_xor_i32:
      case INDEX_op_xor_i64:
 -        return &r_0_re;
 +        return C_O1_I2(r, 0, re);
      case INDEX_op_and_i32:
      case INDEX_op_and_i64:
 -        {
 -            static const TCGTargetOpDef and
 -                = { .args_ct_str = { "r", "0", "reZ" } };
 -            return &and;
 -        }
 -        break;
 +        return C_O1_I2(r, 0, reZ);
 +
      case INDEX_op_andc_i32:
      case INDEX_op_andc_i64:
 -        {
 -            static const TCGTargetOpDef andc
 -                = { .args_ct_str = { "r", "r", "rI" } };
 -            return &andc;
 -        }
 -        break;
 +        return C_O1_I2(r, r, rI);
      case INDEX_op_shl_i32:
      case INDEX_op_shl_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_shr_i64:
      case INDEX_op_sar_i32:
      case INDEX_op_sar_i64:
 -        return have_bmi2 ? &r_r_ri : &r_0_ci;
 +        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
 +
      case INDEX_op_rotl_i32:
      case INDEX_op_rotl_i64:
      case INDEX_op_rotr_i32:
      case INDEX_op_rotr_i64:
 -        return &r_0_ci;
 +        return C_O1_I2(r, 0, ci);
      case INDEX_op_brcond_i32:
      case INDEX_op_brcond_i64:
 -        return &r_re;
 +        return C_O0_I2(r, re);
      case INDEX_op_bswap16_i32:
      case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_not_i32:
      case INDEX_op_not_i64:
      case INDEX_op_extrh_i64_i32:
 -        return &r_0;
 +        return C_O1_I1(r, 0);
      case INDEX_op_ext8s_i32:
      case INDEX_op_ext8s_i64:
      case INDEX_op_ext8u_i32:
      case INDEX_op_ext8u_i64:
 -        return &r_q;
 +        return C_O1_I1(r, q);
 +
      case INDEX_op_ext16s_i32:
      case INDEX_op_ext16s_i64:
      case INDEX_op_ext16u_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
      case INDEX_op_sextract_i32:
      case INDEX_op_ctpop_i32:
      case INDEX_op_ctpop_i64:
 -        return &r_r;
 +        return C_O1_I1(r, r);
 +
      case INDEX_op_extract2_i32:
      case INDEX_op_extract2_i64:
 -        return &r_0_r;
 +        return C_O1_I2(r, 0, r);
      case INDEX_op_deposit_i32:
      case INDEX_op_deposit_i64:
 -        {
 -            static const TCGTargetOpDef dep
 -                = { .args_ct_str = { "Q", "0", "Q" } };
 -            return &dep;
 -        }
 +        return C_O1_I2(Q, 0, Q);
 +
      case INDEX_op_setcond_i32:
      case INDEX_op_setcond_i64:
 -        {
 -            static const TCGTargetOpDef setc
 -                = { .args_ct_str = { "q", "r", "re" } };
 -            return &setc;
 -        }
 +        return C_O1_I2(q, r, re);
 +
      case INDEX_op_movcond_i32:
      case INDEX_op_movcond_i64:
 -        {
 -            static const TCGTargetOpDef movc
 -                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
 -            return &movc;
 -        }
 +        return C_O1_I4(r, r, re, r, 0);
 +
      case INDEX_op_div2_i32:
      case INDEX_op_div2_i64:
      case INDEX_op_divu2_i32:
      case INDEX_op_divu2_i64:
 -        {
 -            static const TCGTargetOpDef div2
 -                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
 -            return &div2;
 -        }
 +        return C_O2_I3(a, d, 0, 1, r);
 +
      case INDEX_op_mulu2_i32:
      case INDEX_op_mulu2_i64:
      case INDEX_op_muls2_i32:
      case INDEX_op_muls2_i64:
 -        {
 -            static const TCGTargetOpDef mul2
 -                = { .args_ct_str = { "a", "d", "a", "r" } };
 -            return &mul2;
 -        }
 +        return C_O2_I2(a, d, a, r);
 +
      case INDEX_op_add2_i32:
      case INDEX_op_add2_i64:
      case INDEX_op_sub2_i32:
      case INDEX_op_sub2_i64:
 -        {
 -            static const TCGTargetOpDef arith2
 -                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
 -            return &arith2;
 -        }
 +        return C_O2_I4(r, r, 0, 1, re, re);
 +
      case INDEX_op_ctz_i32:
      case INDEX_op_ctz_i64:
 -        {
 -            static const TCGTargetOpDef ctz[2] = {
 -                { .args_ct_str = { "&r", "r", "r" } },
 -                { .args_ct_str = { "&r", "r", "rW" } },
 -            };
 -            return &ctz[have_bmi1];
 -        }
 +        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
 +
      case INDEX_op_clz_i32:
      case INDEX_op_clz_i64:
 -        {
 -            static const TCGTargetOpDef clz[2] = {
 -                { .args_ct_str = { "&r", "r", "r" } },
 -                { .args_ct_str = { "&r", "r", "rW" } },
 -            };
 -            return &clz[have_lzcnt];
 -        }
 +        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
      case INDEX_op_qemu_ld_i32:
 -        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
 +        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
 +                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
 +
      case INDEX_op_qemu_st_i32:
 -        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
 +        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
 +                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
 +
      case INDEX_op_qemu_ld_i64:
 -        return (TCG_TARGET_REG_BITS == 64 ? &r_L
 -                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
 -                : &r_r_L_L);
 +        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
 +                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
 +                : C_O2_I2(r, r, L, L));
 +
      case INDEX_op_qemu_st_i64:
 -        return (TCG_TARGET_REG_BITS == 64 ? &L_L
 -                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
 -                : &L_L_L_L);
 +        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
 +                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
 +                : C_O0_I4(L, L, L, L));
      case INDEX_op_brcond2_i32:
 -        {
 -            static const TCGTargetOpDef b2
 -                = { .args_ct_str = { "r", "r", "ri", "ri" } };
 -            return &b2;
 -        }
 +        return C_O0_I4(r, r, ri, ri);
 +
      case INDEX_op_setcond2_i32:
 -        {
 -            static const TCGTargetOpDef s2
 -                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
 -            return &s2;
 -        }
 +        return C_O1_I4(r, r, r, ri, ri);
      case INDEX_op_ld_vec:
 -    case INDEX_op_st_vec:
      case INDEX_op_dupm_vec:
 -        return &x_r;
 +        return C_O1_I1(x, r);
 +
 +    case INDEX_op_st_vec:
 +        return C_O0_I2(x, r);
      case INDEX_op_add_vec:
      case INDEX_op_sub_vec:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
  #if TCG_TARGET_REG_BITS == 32
      case INDEX_op_dup2_vec:
  #endif
 -        return &x_x_x;
 +        return C_O1_I2(x, x, x);
 +
      case INDEX_op_abs_vec:
      case INDEX_op_dup_vec:
      case INDEX_op_shli_vec:
      case INDEX_op_shri_vec:
      case INDEX_op_sari_vec:
      case INDEX_op_x86_psrldq_vec:
 -        return &x_x;
 +        return C_O1_I1(x, x);
 +
      case INDEX_op_x86_vpblendvb_vec:
 -        return &x_x_x_x;
 +        return C_O1_I3(x, x, x, x);
      default:
          break;
 --
 .25.1

-[PATCH 03/43] tcg: Move sorted_args into TCGArgConstraint.sort_index
+[PULL 47/56] tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
-This uses an existing hole in the TCGArgConstraint structure
+This "garbage" setting pre-dates the addition of the type
-and will be convenient for keeping the data in one place.
+changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
 and INDEX_op_extr{l,h}_i64_i32.
+So now we have definitive points at which to adjust z_mask
+to eliminate such bits from the 32-bit operands.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h |  2 +-
+ tcg/optimize.c | 35 ++++++++++++++++-------------------
- tcg/tcg.c         | 35 +++++++++++++++++------------------
+file changed, 16 insertions(+), 19 deletions(-)
 files changed, 18 insertions(+), 19 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tcg_dump_op_count(void);
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
- typedef struct TCGArgConstraint {
+         ti->is_const = true;
-     uint16_t ct;
+         ti->val = ts->val;
-     uint8_t alias_index;
+         ti->z_mask = ts->val;
-+    uint8_t sort_index;
+-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-     TCGRegSet regs;
+-            /* High bits of a 32-bit quantity are garbage.  */
- } TCGArgConstraint;
+-            ti->z_mask |= ~0xffffffffull;
+-        }
-@@ -XXX,XX +XXX,XX @@ typedef struct TCGOpDef {
+     } else {
-     uint8_t nb_oargs, nb_iargs, nb_cargs, nb_args;
+         ti->is_const = false;
-     uint8_t flags;
+         ti->z_mask = -1;
-     TCGArgConstraint *args_ct;
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
--    int *sorted_args;
+     TCGTemp *src_ts = arg_temp(src);
- #if defined(CONFIG_DEBUG_TCG)
+     TempOptInfo *di;
-     int used;
+     TempOptInfo *si;
- #endif
+-    uint64_t z_mask;
-diff --git a/tcg/tcg.c b/tcg/tcg.c
+     TCGOpcode new_op;
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
+     if (ts_are_copies(dst_ts, src_ts)) {
-+++ b/tcg/tcg.c
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-@@ -XXX,XX +XXX,XX @@ void tcg_context_init(TCGContext *s)
+     op->args[0] = dst;
-     int op, total_args, n, i;
+     op->args[1] = src;
-     TCGOpDef *def;
-     TCGArgConstraint *args_ct;
+-    z_mask = si->z_mask;
--    int *sorted_args;
+-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-     TCGTemp *ts;
+-        /* High bits of the destination are now garbage.  */
+-        z_mask |= ~0xffffffffull;
-     memset(s, 0, sizeof(*s));
+-    }
-@@ -XXX,XX +XXX,XX @@ void tcg_context_init(TCGContext *s)
+-    di->z_mask = z_mask;
 +    di->z_mask = si->z_mask;
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
 -    /* Convert movi to mov with constant temp. */
 -    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
 +    TCGTemp *tv;
 +    if (ctx->type == TCG_TYPE_I32) {
 +        val = (int32_t)val;
 +    }
 +
 +    /* Convert movi to mov with constant temp. */
 +    tv = tcg_constant_internal(ctx->type, val);
      init_ts_info(ctx, tv);
      return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      uint64_t z_mask = ctx->z_mask;
      /*
 -     * 32-bit ops generate 32-bit results.  For the result is zero test
 -     * below, we can ignore high bits, but for further optimizations we
 -     * need to record that the high bits contain garbage.
 +     * 32-bit ops generate 32-bit results, which for the purpose of
 +     * simplifying tcg are sign-extended.  Certainly that's how we
 +     * represent our constants elsewhere.  Note that the bits will
 +     * be reset properly for a 64-bit value when encountering the
 +     * type changing opcodes.
       */
      if (ctx->type == TCG_TYPE_I32) {
 -        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
 -        a_mask &= MAKE_64BIT_MASK(0, 32);
 -        z_mask &= MAKE_64BIT_MASK(0, 32);
 +        a_mask = (int32_t)a_mask;
 +        z_mask = (int32_t)z_mask;
 +        ctx->z_mask = z_mask;
      }
-     args_ct = g_malloc(sizeof(TCGArgConstraint) * total_args);
+     if (z_mask == 0) {
 -    sorted_args = g_malloc(sizeof(int) * total_args);
      for(op = 0; op < NB_OPS; op++) {
          def = &tcg_op_defs[op];
          def->args_ct = args_ct;
 -        def->sorted_args = sorted_args;
          n = def->nb_iargs + def->nb_oargs;
 -        sorted_args += n;
          args_ct += n;
      }
@@ -XXX,XX +XXX,XX @@ static int get_constraint_priority(const TCGOpDef *def, int k)
  /* sort from highest priority to lowest */
  static void sort_constraints(TCGOpDef *def, int start, int n)
  {
 -    int i, j, p1, p2, tmp;
 +    int i, j;
 +    TCGArgConstraint *a = def->args_ct;
 -    for(i = 0; i < n; i++)
 -        def->sorted_args[start + i] = start + i;
 -    if (n <= 1)
 +    for (i = 0; i < n; i++) {
 +        a[start + i].sort_index = start + i;
 +    }
 +    if (n <= 1) {
          return;
 -    for(i = 0; i < n - 1; i++) {
 -        for(j = i + 1; j < n; j++) {
 -            p1 = get_constraint_priority(def, def->sorted_args[start + i]);
 -            p2 = get_constraint_priority(def, def->sorted_args[start + j]);
 +    }
 +    for (i = 0; i < n - 1; i++) {
 +        for (j = i + 1; j < n; j++) {
 +            int p1 = get_constraint_priority(def, a[start + i].sort_index);
 +            int p2 = get_constraint_priority(def, a[start + j].sort_index);
              if (p1 < p2) {
 -                tmp = def->sorted_args[start + i];
 -                def->sorted_args[start + i] = def->sorted_args[start + j];
 -                def->sorted_args[start + j] = tmp;
 +                int tmp = a[start + i].sort_index;
 +                a[start + i].sort_index = a[start + j].sort_index;
 +                a[start + j].sort_index = tmp;
              }
          }
      }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
      for (k = 0; k < nb_iargs; k++) {
          TCGRegSet i_preferred_regs, o_preferred_regs;
 -        i = def->sorted_args[nb_oargs + k];
 +        i = def->args_ct[nb_oargs + k].sort_index;
          arg = op->args[i];
          arg_ct = &def->args_ct[i];
          ts = arg_temp(arg);
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                      int k2, i2;
                      reg = ts->reg;
                      for (k2 = 0 ; k2 < k ; k2++) {
 -                        i2 = def->sorted_args[nb_oargs + k2];
 +                        i2 = def->args_ct[nb_oargs + k2].sort_index;
                          if ((def->args_ct[i2].ct & TCG_CT_IALIAS) &&
                              reg == new_args[i2]) {
                              goto allocate_in_reg;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
          /* satisfy the output constraints */
          for(k = 0; k < nb_oargs; k++) {
 -            i = def->sorted_args[k];
 +            i = def->args_ct[k].sort_index;
              arg = op->args[i];
              arg_ct = &def->args_ct[i];
              ts = arg_temp(arg);
 --
 .25.1

-[PATCH 27/43] tcg/tci: Add special tci_movi_{i32,i64} opcodes
+[PULL 48/56] tcg/optimize: Use fold_xx_to_i for orc
-The normal movi opcodes are going away.  We need something
+Recognize the constant function for or-complement.
 for TCI to use internally.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-opc.h    | 8 ++++++++
+ tcg/optimize.c | 1 +
- tcg/tci.c                | 4 ++--
+file changed, 1 insertion(+)
  tcg/tci/tcg-target.c.inc | 4 ++--
 files changed, 12 insertions(+), 4 deletions(-)
-diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-opc.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg-opc.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
- #include "tcg-target.opc.h"
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
- #endif
+ {
+     if (fold_const2(ctx, op) ||
-+#ifdef TCG_TARGET_INTERPRETER
++        fold_xx_to_i(ctx, op, -1) ||
-+/* These opcodes are only for use between the tci generator and interpreter. */
+         fold_xi_to_x(ctx, op, -1) ||
-+DEF(tci_movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
+         fold_ix_to_not(ctx, op, 0)) {
-+#if TCG_TARGET_REG_BITS == 64
+         return true;
 +DEF(tci_movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
 +#endif
 +#endif
 +
  #undef TLADDR_ARGS
  #undef DATA64_ARGS
  #undef IMPL
 diff --git a/tcg/tci.c b/tcg/tci.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci.c
 +++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
              t1 = tci_read_r32(regs, &tb_ptr);
              tci_write_reg32(regs, t0, t1);
              break;
 -        case INDEX_op_movi_i32:
 +        case INDEX_op_tci_movi_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_i32(&tb_ptr);
              tci_write_reg32(regs, t0, t1);
@@ -XXX,XX +XXX,XX @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
              t1 = tci_read_r64(regs, &tb_ptr);
              tci_write_reg64(regs, t0, t1);
              break;
 -        case INDEX_op_movi_i64:
 +        case INDEX_op_tci_movi_i64:
              t0 = *tb_ptr++;
              t1 = tci_read_i64(&tb_ptr);
              tci_write_reg64(regs, t0, t1);
 diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.c.inc
 +++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type,
      uint8_t *old_code_ptr = s->code_ptr;
      uint32_t arg32 = arg;
      if (type == TCG_TYPE_I32 || arg == arg32) {
 -        tcg_out_op_t(s, INDEX_op_movi_i32);
 +        tcg_out_op_t(s, INDEX_op_tci_movi_i32);
          tcg_out_r(s, t0);
          tcg_out32(s, arg32);
      } else {
          tcg_debug_assert(type == TCG_TYPE_I64);
  #if TCG_TARGET_REG_BITS == 64
 -        tcg_out_op_t(s, INDEX_op_movi_i64);
 +        tcg_out_op_t(s, INDEX_op_tci_movi_i64);
          tcg_out_r(s, t0);
          tcg_out64(s, arg);
  #else
 --
 .25.1

-[PATCH 04/43] tcg: Remove TCG_CT_REG
+[PULL 49/56] tcg/optimize: Use fold_xi_to_x for mul
-This wasn't actually used for anything, really.  All variable
+Recognize the identity function for low-part multiply.
 operands must accept registers, which are indicated by the
 set in TCGArgConstraint.regs.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h            |  1 -
+ tcg/optimize.c | 3 ++-
- tcg/tcg.c                    | 15 ++++-----------
+file changed, 2 insertions(+), 1 deletion(-)
  tcg/aarch64/tcg-target.c.inc |  3 ---
  tcg/arm/tcg-target.c.inc     |  3 ---
  tcg/i386/tcg-target.c.inc    | 11 -----------
  tcg/mips/tcg-target.c.inc    |  3 ---
  tcg/ppc/tcg-target.c.inc     |  5 -----
  tcg/riscv/tcg-target.c.inc   |  2 --
  tcg/s390/tcg-target.c.inc    |  4 ----
  tcg/sparc/tcg-target.c.inc   |  5 -----
  tcg/tci/tcg-target.c.inc     |  1 -
 files changed, 4 insertions(+), 49 deletions(-)
-diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tcg_dump_op_count(void);
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
- #define TCG_CT_ALIAS  0x80
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
  #define TCG_CT_IALIAS 0x40
  #define TCG_CT_NEWREG 0x20 /* output requires a new register */
 -#define TCG_CT_REG    0x01
  #define TCG_CT_CONST  0x02 /* any constant of register size */
  typedef struct TCGArgConstraint {
 diff --git a/tcg/tcg.c b/tcg/tcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg.c
 +++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_dump_ops(TCGContext *s, bool have_prefs)
  /* we give more priority to constraints with less registers */
  static int get_constraint_priority(const TCGOpDef *def, int k)
  {
--    const TCGArgConstraint *arg_ct;
+     if (fold_const2(ctx, op) ||
-+    const TCGArgConstraint *arg_ct = &def->args_ct[k];
+-        fold_xi_to_i(ctx, op, 0)) {
-+    int n;
++        fold_xi_to_i(ctx, op, 0) ||
++        fold_xi_to_x(ctx, op, 1)) {
--    int i, n;
+         return true;
 -    arg_ct = &def->args_ct[k];
      if (arg_ct->ct & TCG_CT_ALIAS) {
          /* an alias is equivalent to a single register */
          n = 1;
      } else {
 -        if (!(arg_ct->ct & TCG_CT_REG))
 -            return 0;
 -        n = 0;
 -        for(i = 0; i < TCG_TARGET_NB_REGS; i++) {
 -            if (tcg_regset_test_reg(arg_ct->regs, i))
 -                n++;
 -        }
 +        n = ctpop64(arg_ct->regs);
      }
-     return TCG_TARGET_NB_REGS - n + 1;
+     return false;
  }
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
                          int oarg = *ct_str - '0';
                          tcg_debug_assert(ct_str == tdefs->args_ct_str[i]);
                          tcg_debug_assert(oarg < def->nb_oargs);
 -                        tcg_debug_assert(def->args_ct[oarg].ct & TCG_CT_REG);
 +                        tcg_debug_assert(def->args_ct[oarg].regs != 0);
                          /* TCG_CT_ALIAS is for the output arguments.
                             The input is tagged with TCG_CT_IALIAS. */
                          def->args_ct[i] = def->args_ct[oarg];
 diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/aarch64/tcg-target.c.inc
 +++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  {
      switch (*ct_str++) {
      case 'r': /* general registers */
 -        ct->ct |= TCG_CT_REG;
          ct->regs |= 0xffffffffu;
          break;
      case 'w': /* advsimd registers */
 -        ct->ct |= TCG_CT_REG;
          ct->regs |= 0xffffffff00000000ull;
          break;
      case 'l': /* qemu_ld / qemu_st address, data_reg */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffffu;
  #ifdef CONFIG_SOFTMMU
          /* x0 and x1 will be overwritten when reading the tlb entry,
 diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/arm/tcg-target.c.inc
 +++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
          break;
      case 'r':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffff;
          break;
      /* qemu_ld address */
      case 'l':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffff;
  #ifdef CONFIG_SOFTMMU
          /* r0-r2,lr will be overwritten when reading the tlb entry,
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      /* qemu_st address & data */
      case 's':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffff;
          /* r0-r2 will be overwritten when reading the tlb entry (softmmu only)
             and r0-r1 doing the byte swapping, so don't use these. */
 diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/i386/tcg-target.c.inc
 +++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  {
      switch(*ct_str++) {
      case 'a':
 -        ct->ct |= TCG_CT_REG;
          tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
          break;
      case 'b':
 -        ct->ct |= TCG_CT_REG;
          tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
          break;
      case 'c':
 -        ct->ct |= TCG_CT_REG;
          tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
          break;
      case 'd':
 -        ct->ct |= TCG_CT_REG;
          tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
          break;
      case 'S':
 -        ct->ct |= TCG_CT_REG;
          tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
          break;
      case 'D':
 -        ct->ct |= TCG_CT_REG;
          tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
          break;
      case 'q':
          /* A register that can be used as a byte operand.  */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
          break;
      case 'Q':
          /* A register with an addressable second byte (e.g. %ah).  */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xf;
          break;
      case 'r':
          /* A general register.  */
 -        ct->ct |= TCG_CT_REG;
          ct->regs |= ALL_GENERAL_REGS;
          break;
      case 'W':
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
          break;
      case 'x':
          /* A vector register.  */
 -        ct->ct |= TCG_CT_REG;
          ct->regs |= ALL_VECTOR_REGS;
          break;
          /* qemu_ld/st address constraint */
      case 'L':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
          tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
          tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
 diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.c.inc
 +++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  {
      switch(*ct_str++) {
      case 'r':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          break;
      case 'L': /* qemu_ld input arg constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          tcg_regset_reset_reg(ct->regs, TCG_REG_A0);
  #if defined(CONFIG_SOFTMMU)
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  #endif
          break;
      case 'S': /* qemu_st constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          tcg_regset_reset_reg(ct->regs, TCG_REG_A0);
  #if defined(CONFIG_SOFTMMU)
 diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/ppc/tcg-target.c.inc
 +++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  {
      switch (*ct_str++) {
      case 'A': case 'B': case 'C': case 'D':
 -        ct->ct |= TCG_CT_REG;
          tcg_regset_set_reg(ct->regs, 3 + ct_str[0] - 'A');
          break;
      case 'r':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          break;
      case 'v':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff00000000ull;
          break;
      case 'L':                   /* qemu_ld constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
  #ifdef CONFIG_SOFTMMU
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  #endif
          break;
      case 'S':                   /* qemu_st constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
  #ifdef CONFIG_SOFTMMU
 diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/riscv/tcg-target.c.inc
 +++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  {
      switch (*ct_str++) {
      case 'r':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          break;
      case 'L':
          /* qemu_ld/qemu_st constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          /* qemu_ld/qemu_st uses TCG_REG_TMP0 */
  #if defined(CONFIG_SOFTMMU)
 diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/s390/tcg-target.c.inc
 +++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  {
      switch (*ct_str++) {
      case 'r':                  /* all registers */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffff;
          break;
      case 'L':                  /* qemu_ld/st constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffff;
          tcg_regset_reset_reg(ct->regs, TCG_REG_R2);
          tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
          tcg_regset_reset_reg(ct->regs, TCG_REG_R4);
          break;
      case 'a':                  /* force R2 for division */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0;
          tcg_regset_set_reg(ct->regs, TCG_REG_R2);
          break;
      case 'b':                  /* force R3 for division */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0;
          tcg_regset_set_reg(ct->regs, TCG_REG_R3);
          break;
 diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/sparc/tcg-target.c.inc
 +++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
  {
      switch (*ct_str++) {
      case 'r':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          break;
      case 'R':
 -        ct->ct |= TCG_CT_REG;
          ct->regs = ALL_64;
          break;
      case 'A': /* qemu_ld/st address constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = TARGET_LONG_BITS == 64 ? ALL_64 : 0xffffffff;
      reserve_helpers:
          tcg_regset_reset_reg(ct->regs, TCG_REG_O0);
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
          tcg_regset_reset_reg(ct->regs, TCG_REG_O2);
          break;
      case 's': /* qemu_st data 32-bit constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = 0xffffffff;
          goto reserve_helpers;
      case 'S': /* qemu_st data 64-bit constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = ALL_64;
          goto reserve_helpers;
      case 'I':
 diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.c.inc
 +++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      case 'r':
      case 'L':                   /* qemu_ld constraint */
      case 'S':                   /* qemu_st constraint */
 -        ct->ct |= TCG_CT_REG;
          ct->regs = BIT(TCG_TARGET_NB_REGS) - 1;
          break;
      default:
 --
 .25.1

-[PATCH 32/43] tcg/ppc: Use tcg_constant_vec with tcg vec expanders
+[PULL 50/56] tcg/optimize: Use fold_xi_to_x for div
-Improve expand_vec_shi to use sign-extraction for MO_32.
+Recognize the identity function for division.
 This allows a single VSPLTISB instruction to load all of
 the valid shift constants.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/ppc/tcg-target.c.inc | 44 ++++++++++++++++++++++++----------------
+ tcg/optimize.c | 6 +++++-
-file changed, 27 insertions(+), 17 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/ppc/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/ppc/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
- static void expand_vec_shi(TCGType type, unsigned vece, TCGv_vec v0,
-                            TCGv_vec v1, TCGArg imm, TCGOpcode opci)
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
  {
--    TCGv_vec t1 = tcg_temp_new_vec(type);
+-    return fold_const2(ctx, op);
-+    TCGv_vec t1;
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_x(ctx, op, 1)) {
--    /* Splat w/bytes for xxspltib.  */
++        return true;
 -    tcg_gen_dupi_vec(MO_8, t1, imm & ((8 << vece) - 1));
 +    if (vece == MO_32) {
 +        /*
 +         * Only 5 bits are significant, and VSPLTISB can represent -16..15.
 +         * So using negative numbers gets us the 4th bit easily.
 +         */
 +        imm = sextract32(imm, 0, 5);
 +    } else {
 +        imm &= (8 << vece) - 1;
 +    }
-+
++    return false;
 +    /* Splat w/bytes for xxspltib when 2.07 allows MO_64. */
 +    t1 = tcg_constant_vec(type, MO_8, imm);
      vec_gen_3(opci, type, vece, tcgv_vec_arg(v0),
                tcgv_vec_arg(v1), tcgv_vec_arg(t1));
 -    tcg_temp_free_vec(t1);
  }
- static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
+ static bool fold_dup(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
  {
      TCGv_vec t1 = tcg_temp_new_vec(type);
      TCGv_vec t2 = tcg_temp_new_vec(type);
 -    TCGv_vec t3, t4;
 +    TCGv_vec c0, c16;
      switch (vece) {
      case MO_8:
@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
      case MO_32:
          tcg_debug_assert(!have_isa_2_07);
 -        t3 = tcg_temp_new_vec(type);
 -        t4 = tcg_temp_new_vec(type);
 -        tcg_gen_dupi_vec(MO_8, t4, -16);
 +        /*
 +         * Only 5 bits are significant, and VSPLTISB can represent -16..15.
 +         * So using -16 is a quick way to represent 16.
 +         */
 +        c16 = tcg_constant_vec(type, MO_8, -16);
 +        c0 = tcg_constant_vec(type, MO_8, 0);
 +
          vec_gen_3(INDEX_op_rotlv_vec, type, MO_32, tcgv_vec_arg(t1),
 -                  tcgv_vec_arg(v2), tcgv_vec_arg(t4));
 +                  tcgv_vec_arg(v2), tcgv_vec_arg(c16));
          vec_gen_3(INDEX_op_ppc_mulou_vec, type, MO_16, tcgv_vec_arg(t2),
                    tcgv_vec_arg(v1), tcgv_vec_arg(v2));
 -        tcg_gen_dupi_vec(MO_8, t3, 0);
 -        vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t3),
 -                  tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
 -        vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t3),
 -                  tcgv_vec_arg(t3), tcgv_vec_arg(t4));
 -        tcg_gen_add_vec(MO_32, v0, t2, t3);
 -        tcg_temp_free_vec(t3);
 -        tcg_temp_free_vec(t4);
 +        vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t1),
 +                  tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(c0));
 +        vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t1),
 +                  tcgv_vec_arg(t1), tcgv_vec_arg(c16));
 +        tcg_gen_add_vec(MO_32, v0, t1, t2);
          break;
      default:
 --
 .25.1

-[PATCH 08/43] tcg: Fix generation of dupi_vec for 32-bit host
+[PULL 51/56] tcg/optimize: Use fold_xx_to_i for rem
-The definition of INDEX_op_dupi_vec is that it operates on
+Recognize the constant function for remainder.
 units of tcg_target_ulong -- in this case 32 bits.  It does
 not work to use this for a uint64_t value that happens to be
 small enough to fit in tcg_target_ulong.
-Fixes: d2fd745fe8b
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
-Fixes: db432672dc5
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Cc: qemu-stable@nongnu.org
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg-op-vec.c | 12 ++++++++----
+ tcg/optimize.c | 6 +++++-
-file changed, 8 insertions(+), 4 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg-op-vec.c
+--- a/tcg/optimize.c
-+++ b/tcg/tcg-op-vec.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
- void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
+ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  {
--    if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) {
+-    return fold_const2(ctx, op);
--        do_dupi_vec(r, MO_32, a);
++    if (fold_const2(ctx, op) ||
--    } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) {
++        fold_xx_to_i(ctx, op, 0)) {
-+    if (TCG_TARGET_REG_BITS == 64) {
++        return true;
          do_dupi_vec(r, MO_64, a);
 +    } else if (a == dup_const(MO_32, a)) {
 +        do_dupi_vec(r, MO_32, a);
      } else {
          TCGv_i64 c = tcg_const_i64(a);
          tcg_gen_dup_i64_vec(MO_64, r, c);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
  void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
  {
 -    do_dupi_vec(r, MO_REG, dup_const(vece, a));
 +    if (vece == MO_64) {
 +        tcg_gen_dup64i_vec(r, a);
 +    } else {
 +        do_dupi_vec(r, MO_REG, dup_const(vece, a));
 +    }
++    return false;
  }
- void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
+ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 --
 .25.1

-[PATCH 02/43] tcg: Drop union from TCGArgConstraint
+[PULL 52/56] tcg/optimize: Optimize sign extensions
-The union is unused; let "regs" appear in the main structure
+Certain targets, like riscv, produce signed 32-bit results.
-without the "u.regs" wrapping.
+This can lead to lots of redundant extensions as values are
+manipulated.
 Begin by tracking only the obvious sign-extensions, and
 converting them to simple copies when possible.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg.h            |  4 +---
+ tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
- tcg/tcg.c                    | 22 +++++++++++-----------
+file changed, 102 insertions(+), 21 deletions(-)
- tcg/aarch64/tcg-target.c.inc | 14 +++++++-------
- tcg/arm/tcg-target.c.inc     | 26 +++++++++++++-------------
+diff --git a/tcg/optimize.c b/tcg/optimize.c
  tcg/i386/tcg-target.c.inc    | 26 +++++++++++++-------------
  tcg/mips/tcg-target.c.inc    | 18 +++++++++---------
  tcg/ppc/tcg-target.c.inc     | 24 ++++++++++++------------
  tcg/riscv/tcg-target.c.inc   | 14 +++++++-------
  tcg/s390/tcg-target.c.inc    | 18 +++++++++---------
  tcg/sparc/tcg-target.c.inc   | 16 ++++++++--------
  tcg/tci/tcg-target.c.inc     |  2 +-
 files changed, 91 insertions(+), 93 deletions(-)
 diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tcg_dump_op_count(void);
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
- typedef struct TCGArgConstraint {
+     TCGTemp *next_copy;
-     uint16_t ct;
+     uint64_t val;
-     uint8_t alias_index;
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
--    union {
++    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
--        TCGRegSet regs;
+ } TempOptInfo;
--    } u;
-+    TCGRegSet regs;
+ typedef struct OptContext {
- } TCGArgConstraint;
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
+     /* In flight values from optimization. */
- #define TCG_MAX_OP_ARGS 16
+     uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
-diff --git a/tcg/tcg.c b/tcg/tcg.c
+     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
-index XXXXXXX..XXXXXXX 100644
++    uint64_t s_mask;  /* mask of clrsb(value) bits */
---- a/tcg/tcg.c
+     TCGType type;
-+++ b/tcg/tcg.c
+ } OptContext;
-@@ -XXX,XX +XXX,XX @@ static int get_constraint_priority(const TCGOpDef *def, int k)
-             return 0;
++/* Calculate the smask for a specific value. */
-         n = 0;
++static uint64_t smask_from_value(uint64_t value)
-         for(i = 0; i < TCG_TARGET_NB_REGS; i++) {
++{
--            if (tcg_regset_test_reg(arg_ct->u.regs, i))
++    int rep = clrsb64(value);
-+            if (tcg_regset_test_reg(arg_ct->regs, i))
++    return ~(~0ull >> rep);
-                 n++;
++}
 +
 +/*
 + * Calculate the smask for a given set of known-zeros.
 + * If there are lots of zeros on the left, we can consider the remainder
 + * an unsigned field, and thus the corresponding signed field is one bit
 + * larger.
 + */
 +static uint64_t smask_from_zmask(uint64_t zmask)
 +{
 +    /*
 +     * Only the 0 bits are significant for zmask, thus the msb itself
 +     * must be zero, else we have no sign information.
 +     */
 +    int rep = clz64(zmask);
 +    if (rep == 0) {
 +        return 0;
 +    }
 +    rep -= 1;
 +    return ~(~0ull >> rep);
 +}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
      ti->prev_copy = ts;
      ti->is_const = false;
      ti->z_mask = -1;
 +    ti->s_mask = 0;
  }
  static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
          ti->is_const = true;
          ti->val = ts->val;
          ti->z_mask = ts->val;
 +        ti->s_mask = smask_from_value(ts->val);
      } else {
          ti->is_const = false;
          ti->z_mask = -1;
 +        ti->s_mask = 0;
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      op->args[1] = src;
      di->z_mask = si->z_mask;
 +    di->s_mask = si->s_mask;
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      nb_oargs = def->nb_oargs;
      for (i = 0; i < nb_oargs; i++) {
 -        reset_temp(op->args[i]);
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        reset_ts(ts);
          /*
 -         * Save the corresponding known-zero bits mask for the
 +         * Save the corresponding known-zero/sign bits mask for the
           * first output argument (only one supported so far).
           */
          if (i == 0) {
 -            arg_info(op->args[i])->z_mask = ctx->z_mask;
 +            ts_info(ts)->z_mask = ctx->z_mask;
 +            ts_info(ts)->s_mask = ctx->s_mask;
          }
      }
-@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
+ }
-             /* Incomplete TCGTargetOpDef entry. */
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
-             tcg_debug_assert(ct_str != NULL);
+ {
+     uint64_t a_mask = ctx->a_mask;
--            def->args_ct[i].u.regs = 0;
+     uint64_t z_mask = ctx->z_mask;
-+            def->args_ct[i].regs = 0;
++    uint64_t s_mask = ctx->s_mask;
-             def->args_ct[i].ct = 0;
-             while (*ct_str != '\0') {
+     /*
-                 switch(*ct_str) {
+      * 32-bit ops generate 32-bit results, which for the purpose of
-@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
-                     pset = la_temp_pref(ts);
+     if (ctx->type == TCG_TYPE_I32) {
-                     set = *pset;
+         a_mask = (int32_t)a_mask;
+         z_mask = (int32_t)z_mask;
--                    set &= ct->u.regs;
++        s_mask |= MAKE_64BIT_MASK(32, 32);
-+                    set &= ct->regs;
+         ctx->z_mask = z_mask;
-                     if (ct->ct & TCG_CT_IALIAS) {
++        ctx->s_mask = s_mask;
-                         set &= op->output_pref[ct->alias_index];
+     }
-                     }
-                     /* If the combination is not possible, restart.  */
+     if (z_mask == 0) {
-                     if (set == 0) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
--                        set = ct->u.regs;
-+                        set = ct->regs;
+ static bool fold_bswap(OptContext *ctx, TCGOp *op)
-                     }
+ {
-                     *pset = set;
+-    uint64_t z_mask, sign;
-                 }
++    uint64_t z_mask, s_mask, sign;
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
-         return;
+     if (arg_is_const(op->args[1])) {
-     }
+         uint64_t t = arg_info(op->args[1])->val;
+@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
--    dup_out_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[0].u.regs;
+     }
--    dup_in_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[1].u.regs;
-+    dup_out_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[0].regs;
+     z_mask = arg_info(op->args[1])->z_mask;
-+    dup_in_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[1].regs;
++
+     switch (op->opc) {
-     /* Allocate the output register now.  */
+     case INDEX_op_bswap16_i32:
-     if (ots->val_type != TEMP_VAL_REG) {
+     case INDEX_op_bswap16_i64:
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
-             }
+     default:
          g_assert_not_reached();
      }
 +    s_mask = smask_from_zmask(z_mask);
      switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
      case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
          /* If the sign bit may be 1, force all the bits above to 1. */
          if (z_mask & sign) {
              z_mask |= sign;
 +            s_mask = sign << 1;
          }
+         break;
--        temp_load(s, ts, arg_ct->u.regs, i_allocated_regs, i_preferred_regs);
+     default:
-+        temp_load(s, ts, arg_ct->regs, i_allocated_regs, i_preferred_regs);
+         /* The high bits are undefined: force all bits above the sign to 1. */
-         reg = ts->reg;
+         z_mask |= sign << 1;
++        s_mask = 0;
--        if (tcg_regset_test_reg(arg_ct->u.regs, reg)) {
+         break;
-+        if (tcg_regset_test_reg(arg_ct->regs, reg)) {
+     }
-             /* nothing to do : the constraint is satisfied */
+     ctx->z_mask = z_mask;
-         } else {
++    ctx->s_mask = s_mask;
-         allocate_in_reg:
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
+     return fold_masks(ctx, op);
-                and move the temporary register into it */
+ }
-             temp_load(s, ts, tcg_target_available_regs[ts->type],
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
-                       i_allocated_regs, 0);
+ static bool fold_extract(OptContext *ctx, TCGOp *op)
--            reg = tcg_reg_alloc(s, arg_ct->u.regs, i_allocated_regs,
+ {
-+            reg = tcg_reg_alloc(s, arg_ct->regs, i_allocated_regs,
+     uint64_t z_mask_old, z_mask;
-                                 o_preferred_regs, ts->indirect_base);
++    int pos = op->args[2];
-             if (!tcg_out_mov(s, ts->type, reg, ts->reg)) {
++    int len = op->args[3];
-                 /*
-@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
+     if (arg_is_const(op->args[1])) {
-                 && !const_args[arg_ct->alias_index]) {
+         uint64_t t;
-                 reg = new_args[arg_ct->alias_index];
-             } else if (arg_ct->ct & TCG_CT_NEWREG) {
+         t = arg_info(op->args[1])->val;
--                reg = tcg_reg_alloc(s, arg_ct->u.regs,
+-        t = extract64(t, op->args[2], op->args[3]);
-+                reg = tcg_reg_alloc(s, arg_ct->regs,
++        t = extract64(t, pos, len);
-                                     i_allocated_regs | o_allocated_regs,
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
-                                     op->output_pref[k], ts->indirect_base);
+     }
-             } else {
--                reg = tcg_reg_alloc(s, arg_ct->u.regs, o_allocated_regs,
+     z_mask_old = arg_info(op->args[1])->z_mask;
-+                reg = tcg_reg_alloc(s, arg_ct->regs, o_allocated_regs,
+-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
-                                     op->output_pref[k], ts->indirect_base);
+-    if (op->args[2] == 0) {
-             }
++    z_mask = extract64(z_mask_old, pos, len);
-             tcg_regset_set_reg(o_allocated_regs, reg);
++    if (pos == 0) {
-diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
+         ctx->a_mask = z_mask_old ^ z_mask;
-index XXXXXXX..XXXXXXX 100644
+     }
---- a/tcg/aarch64/tcg-target.c.inc
+     ctx->z_mask = z_mask;
-+++ b/tcg/aarch64/tcg-target.c.inc
++    ctx->s_mask = smask_from_zmask(z_mask);
-@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
-     switch (*ct_str++) {
+     return fold_masks(ctx, op);
-     case 'r': /* general registers */
+ }
-         ct->ct |= TCG_CT_REG;
+@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
--        ct->u.regs |= 0xffffffffu;
-+        ct->regs |= 0xffffffffu;
+ static bool fold_exts(OptContext *ctx, TCGOp *op)
-         break;
+ {
-     case 'w': /* advsimd registers */
+-    uint64_t z_mask_old, z_mask, sign;
-         ct->ct |= TCG_CT_REG;
++    uint64_t s_mask_old, s_mask, z_mask, sign;
--        ct->u.regs |= 0xffffffff00000000ull;
+     bool type_change = false;
-+        ct->regs |= 0xffffffff00000000ull;
-         break;
+     if (fold_const1(ctx, op)) {
-     case 'l': /* qemu_ld / qemu_st address, data_reg */
+         return true;
-         ct->ct |= TCG_CT_REG;
+     }
--        ct->u.regs = 0xffffffffu;
-+        ct->regs = 0xffffffffu;
+-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
- #ifdef CONFIG_SOFTMMU
++    z_mask = arg_info(op->args[1])->z_mask;
-         /* x0 and x1 will be overwritten when reading the tlb entry,
++    s_mask = arg_info(op->args[1])->s_mask;
-            and x2, and x3 for helper args, better to avoid using them. */
++    s_mask_old = s_mask;
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_X0);
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_X1);
+     switch (op->opc) {
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_X2);
+     CASE_OP_32_64(ext8s):
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_X3);
+@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_X0);
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_X1);
+     if (z_mask & sign) {
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_X2);
+         z_mask |= sign;
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_X3);
+-    } else if (!type_change) {
- #endif
+-        ctx->a_mask = z_mask_old ^ z_mask;
-         break;
+     }
-     case 'A': /* Valid for arithmetic immediate (positive or negative).  */
++    s_mask |= sign << 1;
-diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
++
-index XXXXXXX..XXXXXXX 100644
+     ctx->z_mask = z_mask;
---- a/tcg/arm/tcg-target.c.inc
++    ctx->s_mask = s_mask;
-+++ b/tcg/arm/tcg-target.c.inc
++    if (!type_change) {
-@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
++        ctx->a_mask = s_mask & ~s_mask_old;
++    }
-     case 'r':
-         ct->ct |= TCG_CT_REG;
+     return fold_masks(ctx, op);
--        ct->u.regs = 0xffff;
+ }
-+        ct->regs = 0xffff;
+@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
-         break;
+     }
-     /* qemu_ld address */
+     ctx->z_mask = z_mask;
-     case 'l':
++    ctx->s_mask = smask_from_zmask(z_mask);
-         ct->ct |= TCG_CT_REG;
+     if (!type_change) {
--        ct->u.regs = 0xffff;
+         ctx->a_mask = z_mask_old ^ z_mask;
-+        ct->regs = 0xffff;
+     }
- #ifdef CONFIG_SOFTMMU
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
-         /* r0-r2,lr will be overwritten when reading the tlb entry,
+     MemOp mop = get_memop(oi);
-            so don't use these. */
+     int width = 8 * memop_size(mop);
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
+-    if (!(mop & MO_SIGN) && width < 64) {
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
+-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
++    if (width < 64) {
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
++        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R0);
++        if (!(mop & MO_SIGN)) {
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R1);
++            ctx->z_mask = MAKE_64BIT_MASK(0, width);
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R2);
++            ctx->s_mask <<= 1;
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
++        }
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R14);
+     }
- #endif
-         break;
+     /* Opcodes that touch guest memory stop the mb optimization.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
-     /* qemu_st address & data */
-     case 's':
+ static bool fold_sextract(OptContext *ctx, TCGOp *op)
-         ct->ct |= TCG_CT_REG;
+ {
--        ct->u.regs = 0xffff;
+-    int64_t z_mask_old, z_mask;
-+        ct->regs = 0xffff;
++    uint64_t z_mask, s_mask, s_mask_old;
-         /* r0-r2 will be overwritten when reading the tlb entry (softmmu only)
++    int pos = op->args[2];
-            and r0-r1 doing the byte swapping, so don't use these. */
++    int len = op->args[3];
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
+     if (arg_is_const(op->args[1])) {
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R0);
+         uint64_t t;
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R1);
- #if defined(CONFIG_SOFTMMU)
+         t = arg_info(op->args[1])->val;
-         /* Avoid clashes with registers being used for helper args */
+-        t = sextract64(t, op->args[2], op->args[3]);
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
++        t = sextract64(t, pos, len);
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R2);
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
- #if TARGET_LONG_BITS == 64
+     }
-         /* Avoid clashes with registers being used for helper args */
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
+-    z_mask_old = arg_info(op->args[1])->z_mask;
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
+-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
- #endif
+-    if (op->args[2] == 0 && z_mask >= 0) {
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
+-        ctx->a_mask = z_mask_old ^ z_mask;
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_R14);
+-    }
- #endif
++    z_mask = arg_info(op->args[1])->z_mask;
-         break;
++    z_mask = sextract64(z_mask, pos, len);
+     ctx->z_mask = z_mask;
-diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
-index XXXXXXX..XXXXXXX 100644
++    s_mask_old = arg_info(op->args[1])->s_mask;
---- a/tcg/i386/tcg-target.c.inc
++    s_mask = sextract64(s_mask_old, pos, len);
-+++ b/tcg/i386/tcg-target.c.inc
++    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
-@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
++    ctx->s_mask = s_mask;
-     switch(*ct_str++) {
++
-     case 'a':
++    if (pos == 0) {
-         ct->ct |= TCG_CT_REG;
++        ctx->a_mask = s_mask & ~s_mask_old;
--        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
++    }
-+        tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
++
-         break;
+     return fold_masks(ctx, op);
-     case 'b':
+ }
-         ct->ct |= TCG_CT_REG;
--        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
+@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
-+        tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
+ {
-         break;
+     /* We can't do any folding with a load, but we can record bits. */
-     case 'c':
+     switch (op->opc) {
-         ct->ct |= TCG_CT_REG;
++    CASE_OP_32_64(ld8s):
--        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
++        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
-+        tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
++        break;
-         break;
+     CASE_OP_32_64(ld8u):
-     case 'd':
+         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
-         ct->ct |= TCG_CT_REG;
++        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
--        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
++        break;
-+        tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
++    CASE_OP_32_64(ld16s):
-         break;
++        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
-     case 'S':
+         break;
-         ct->ct |= TCG_CT_REG;
+     CASE_OP_32_64(ld16u):
--        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
+         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
-+        tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
++        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
-         break;
++        break;
-     case 'D':
++    case INDEX_op_ld32s_i64:
-         ct->ct |= TCG_CT_REG;
++        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
--        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
+         break;
-+        tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
+     case INDEX_op_ld32u_i64:
-         break;
+         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
-     case 'q':
++        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
-         /* A register that can be used as a byte operand.  */
+         break;
-         ct->ct |= TCG_CT_REG;
+     default:
--        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
+         g_assert_not_reached();
-+        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-         break;
+             ctx.type = TCG_TYPE_I32;
      case 'Q':
          /* A register with an addressable second byte (e.g. %ah).  */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xf;
 +        ct->regs = 0xf;
          break;
      case 'r':
          /* A general register.  */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs |= ALL_GENERAL_REGS;
 +        ct->regs |= ALL_GENERAL_REGS;
          break;
      case 'W':
          /* With TZCNT/LZCNT, we can have operand-size as an input.  */
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      case 'x':
          /* A vector register.  */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs |= ALL_VECTOR_REGS;
 +        ct->regs |= ALL_VECTOR_REGS;
          break;
          /* qemu_ld/st address constraint */
      case 'L':
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
 +        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
          break;
      case 'e':
 diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/mips/tcg-target.c.inc
 +++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      switch(*ct_str++) {
      case 'r':
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 +        ct->regs = 0xffffffff;
          break;
      case 'L': /* qemu_ld input arg constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
 +        ct->regs = 0xffffffff;
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_A0);
  #if defined(CONFIG_SOFTMMU)
          if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
 -            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
 +            tcg_regset_reset_reg(ct->regs, TCG_REG_A2);
          }
- #endif
-         break;
+-        /* Assume all bits affected, and no bits known zero. */
-     case 'S': /* qemu_st constraint */
++        /* Assume all bits affected, no bits known zero, no sign reps. */
-         ct->ct |= TCG_CT_REG;
+         ctx.a_mask = -1;
--        ct->u.regs = 0xffffffff;
+         ctx.z_mask = -1;
--        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
++        ctx.s_mask = 0;
-+        ct->regs = 0xffffffff;
-+        tcg_regset_reset_reg(ct->regs, TCG_REG_A0);
+         /*
- #if defined(CONFIG_SOFTMMU)
+          * Process each opcode.
-         if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
+         case INDEX_op_extrh_i64_i32:
--            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A3);
+             done = fold_extu(&ctx, op);
-+            tcg_regset_reset_reg(ct->regs, TCG_REG_A2);
+             break;
-+            tcg_regset_reset_reg(ct->regs, TCG_REG_A3);
++        CASE_OP_32_64(ld8s):
-         } else {
+         CASE_OP_32_64(ld8u):
--            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A1);
++        CASE_OP_32_64(ld16s):
-+            tcg_regset_reset_reg(ct->regs, TCG_REG_A1);
+         CASE_OP_32_64(ld16u):
-         }
++        case INDEX_op_ld32s_i64:
- #endif
+         case INDEX_op_ld32u_i64:
-         break;
+             done = fold_tcg_ld(&ctx, op);
-diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
+             break;
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/ppc/tcg-target.c.inc
 +++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      switch (*ct_str++) {
      case 'A': case 'B': case 'C': case 'D':
          ct->ct |= TCG_CT_REG;
 -        tcg_regset_set_reg(ct->u.regs, 3 + ct_str[0] - 'A');
 +        tcg_regset_set_reg(ct->regs, 3 + ct_str[0] - 'A');
          break;
      case 'r':
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 +        ct->regs = 0xffffffff;
          break;
      case 'v':
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff00000000ull;
 +        ct->regs = 0xffffffff00000000ull;
          break;
      case 'L':                   /* qemu_ld constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
 +        ct->regs = 0xffffffff;
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
  #ifdef CONFIG_SOFTMMU
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R4);
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R5);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R4);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R5);
  #endif
          break;
      case 'S':                   /* qemu_st constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
 +        ct->regs = 0xffffffff;
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
  #ifdef CONFIG_SOFTMMU
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R4);
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R5);
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R6);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R4);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R5);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R6);
  #endif
          break;
      case 'I':
 diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/riscv/tcg-target.c.inc
 +++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      switch (*ct_str++) {
      case 'r':
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 +        ct->regs = 0xffffffff;
          break;
      case 'L':
          /* qemu_ld/qemu_st constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 +        ct->regs = 0xffffffff;
          /* qemu_ld/qemu_st uses TCG_REG_TMP0 */
  #if defined(CONFIG_SOFTMMU)
 -        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[0]);
 -        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[1]);
 -        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[2]);
 -        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[3]);
 -        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[4]);
 +        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[0]);
 +        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[1]);
 +        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[2]);
 +        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[3]);
 +        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[4]);
  #endif
          break;
      case 'I':
 diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/s390/tcg-target.c.inc
 +++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      switch (*ct_str++) {
      case 'r':                  /* all registers */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffff;
 +        ct->regs = 0xffff;
          break;
      case 'L':                  /* qemu_ld/st constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffff;
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R4);
 +        ct->regs = 0xffff;
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R2);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_R4);
          break;
      case 'a':                  /* force R2 for division */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0;
 -        tcg_regset_set_reg(ct->u.regs, TCG_REG_R2);
 +        ct->regs = 0;
 +        tcg_regset_set_reg(ct->regs, TCG_REG_R2);
          break;
      case 'b':                  /* force R3 for division */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0;
 -        tcg_regset_set_reg(ct->u.regs, TCG_REG_R3);
 +        ct->regs = 0;
 +        tcg_regset_set_reg(ct->regs, TCG_REG_R3);
          break;
      case 'A':
          ct->ct |= TCG_CT_CONST_S33;
 diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/sparc/tcg-target.c.inc
 +++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      switch (*ct_str++) {
      case 'r':
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 +        ct->regs = 0xffffffff;
          break;
      case 'R':
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = ALL_64;
 +        ct->regs = ALL_64;
          break;
      case 'A': /* qemu_ld/st address constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = TARGET_LONG_BITS == 64 ? ALL_64 : 0xffffffff;
 +        ct->regs = TARGET_LONG_BITS == 64 ? ALL_64 : 0xffffffff;
      reserve_helpers:
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_O0);
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_O1);
 -        tcg_regset_reset_reg(ct->u.regs, TCG_REG_O2);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_O0);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_O1);
 +        tcg_regset_reset_reg(ct->regs, TCG_REG_O2);
          break;
      case 's': /* qemu_st data 32-bit constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = 0xffffffff;
 +        ct->regs = 0xffffffff;
          goto reserve_helpers;
      case 'S': /* qemu_st data 64-bit constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = ALL_64;
 +        ct->regs = ALL_64;
          goto reserve_helpers;
      case 'I':
          ct->ct |= TCG_CT_CONST_S11;
 diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tci/tcg-target.c.inc
 +++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
      case 'L':                   /* qemu_ld constraint */
      case 'S':                   /* qemu_st constraint */
          ct->ct |= TCG_CT_REG;
 -        ct->u.regs = BIT(TCG_TARGET_NB_REGS) - 1;
 +        ct->regs = BIT(TCG_TARGET_NB_REGS) - 1;
          break;
      default:
          return NULL;
 --
 .25.1

-[PATCH 22/43] tcg: Convert tcg_gen_dupi_vec to TCG_CONST
+[PULL 53/56] tcg/optimize: Propagate sign info for logical operations
-Because we now store uint64_t in TCGTemp, we can now always
+Sign repetitions are perforce all identical, whether they are 1 or 0.
-store the full 64-bit duplicate immediate.  So remove the
+Bitwise operations preserve the relative quantity of the repetitions.
 difference between 32- and 64-bit hosts.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/optimize.c   |  9 ++++-----
+ tcg/optimize.c | 29 +++++++++++++++++++++++++++++
- tcg/tcg-op-vec.c | 39 ++++++++++-----------------------------
+file changed, 29 insertions(+)
  tcg/tcg.c        |  7 +------
 files changed, 15 insertions(+), 40 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-         case INDEX_op_dup2_vec:
+     z2 = arg_info(op->args[2])->z_mask;
-             assert(TCG_TARGET_REG_BITS == 32);
+     ctx->z_mask = z1 & z2;
-             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
--                tmp = arg_info(op->args[1])->val;
++    /*
--                if (tmp == arg_info(op->args[2])->val) {
++     * Sign repetitions are perforce all identical, whether they are 1 or 0.
--                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++     * Bitwise operations preserve the relative quantity of the repetitions.
--                    break;
++     */
--                }
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
++                & arg_info(op->args[2])->s_mask;
-+                                 deposit64(arg_info(op->args[1])->val, 32, 32,
++
-+                                           arg_info(op->args[2])->val));
+     /*
-+                break;
+      * Known-zeros does not imply known-ones.  Therefore unless
-             } else if (args_are_copies(op->args[1], op->args[2])) {
+      * arg2 is constant, we can't infer affected bits from it.
-                 op->opc = INDEX_op_dup_vec;
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
                  TCGOP_VECE(op) = MO_32;
 diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op-vec.c
 +++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
      }
+     ctx->z_mask = z1;
++    ctx->s_mask = arg_info(op->args[1])->s_mask
++                & arg_info(op->args[2])->s_mask;
+     return fold_masks(ctx, op);
  }
--#define MO_REG  (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
--
+         fold_xi_to_not(ctx, op, 0)) {
--static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
+         return true;
--{
+     }
--    TCGTemp *rt = tcgv_vec_temp(r);
++
--    vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);
++    ctx->s_mask = arg_info(op->args[1])->s_mask
--}
++                & arg_info(op->args[2])->s_mask;
--
+     return false;
  TCGv_vec tcg_const_zeros_vec(TCGType type)
  {
      TCGv_vec ret = tcg_temp_new_vec(type);
 -    do_dupi_vec(ret, MO_REG, 0);
 +    tcg_gen_dupi_vec(MO_64, ret, 0);
      return ret;
  }
- TCGv_vec tcg_const_ones_vec(TCGType type)
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
- {
-     TCGv_vec ret = tcg_temp_new_vec(type);
+     ctx->z_mask = arg_info(op->args[3])->z_mask
--    do_dupi_vec(ret, MO_REG, -1);
+                 | arg_info(op->args[4])->z_mask;
-+    tcg_gen_dupi_vec(MO_64, ret, -1);
++    ctx->s_mask = arg_info(op->args[3])->s_mask
-     return ret;
++                & arg_info(op->args[4])->s_mask;
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
 +
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return false;
  }
-@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
+@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
+         fold_xi_to_not(ctx, op, 0)) {
- void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
+         return true;
- {
+     }
--    if (TCG_TARGET_REG_BITS == 64) {
++
--        do_dupi_vec(r, MO_64, a);
++    ctx->s_mask = arg_info(op->args[1])->s_mask
--    } else if (a == dup_const(MO_32, a)) {
++                & arg_info(op->args[2])->s_mask;
--        do_dupi_vec(r, MO_32, a);
+     return false;
 -    } else {
 -        TCGv_i64 c = tcg_const_i64(a);
 -        tcg_gen_dup_i64_vec(MO_64, r, c);
 -        tcg_temp_free_i64(c);
 -    }
 +    tcg_gen_dupi_vec(MO_64, r, a);
  }
- void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
- {
+         return true;
--    do_dupi_vec(r, MO_REG, dup_const(MO_32, a));
+     }
-+    tcg_gen_dupi_vec(MO_32, r, a);
 +    ctx->s_mask = arg_info(op->args[1])->s_mask;
 +
      /* Because of fold_to_not, we want to always return true, via finish. */
      finish_folding(ctx, op);
      return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
      ctx->z_mask = arg_info(op->args[1])->z_mask
                  | arg_info(op->args[2])->z_mask;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
- void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
+@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
- {
+         fold_ix_to_not(ctx, op, 0)) {
--    do_dupi_vec(r, MO_REG, dup_const(MO_16, a));
+         return true;
-+    tcg_gen_dupi_vec(MO_16, r, a);
+     }
 +
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return false;
  }
- void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
+@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
- {
--    do_dupi_vec(r, MO_REG, dup_const(MO_8, a));
+     ctx->z_mask = arg_info(op->args[1])->z_mask
-+    tcg_gen_dupi_vec(MO_8, r, a);
+                 | arg_info(op->args[2])->z_mask;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
- void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
- {
--    if (vece == MO_64) {
--        tcg_gen_dup64i_vec(r, a);
--    } else {
--        do_dupi_vec(r, MO_REG, dup_const(vece, a));
--    }
-+    TCGTemp *rt = tcgv_vec_temp(r);
-+    tcg_gen_mov_vec(r, tcg_constant_vec(rt->base_type, vece, a));
- }
- void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
-             if (tcg_can_emit_vec_op(INDEX_op_sari_vec, type, vece) > 0) {
-                 tcg_gen_sari_vec(vece, t, a, (8 << vece) - 1);
-             } else {
--                do_dupi_vec(t, MO_REG, 0);
--                tcg_gen_cmp_vec(TCG_COND_LT, vece, t, a, t);
-+                tcg_gen_cmp_vec(TCG_COND_LT, vece, t, a,
-+                                tcg_constant_vec(type, vece, 0));
-             }
-             tcg_gen_xor_vec(vece, r, a, t);
-             tcg_gen_sub_vec(vece, r, r, t);
-diff --git a/tcg/tcg.c b/tcg/tcg.c
-index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
-+++ b/tcg/tcg.c
-@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
-              * The targets will, in general, have to do this search anyway,
-              * do this generically.
-              */
--            if (TCG_TARGET_REG_BITS == 32) {
--                val = dup_const(MO_32, val);
--                vece = MO_32;
--            }
-             if (val == dup_const(MO_8, val)) {
-                 vece = MO_8;
-             } else if (val == dup_const(MO_16, val)) {
-                 vece = MO_16;
--            } else if (TCG_TARGET_REG_BITS == 64 &&
--                       val == dup_const(MO_32, val)) {
-+            } else if (val == dup_const(MO_32, val)) {
-                 vece = MO_32;
-             }
 --
 .25.1

-[PATCH 23/43] tcg: Use tcg_constant_i32 with icount expander
+[PULL 54/56] tcg/optimize: Propagate sign info for setcond
-We must do this before we adjust tcg_out_movi_i32, lest the
+The result is either 0 or 1, which means that we have
-under-the-hood poking that we do for icount be broken.
+a 2 bit signed result, and thus 62 bits of sign.
 For clarity, use the smask_from_zmask function.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/gen-icount.h | 25 +++++++++++++------------
+ tcg/optimize.c | 2 ++
-file changed, 13 insertions(+), 12 deletions(-)
+file changed, 2 insertions(+)
-diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/gen-icount.h
+--- a/tcg/optimize.c
-+++ b/include/exec/gen-icount.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline void gen_io_end(void)
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
  static inline void gen_tb_start(TranslationBlock *tb)
  {
 -    TCGv_i32 count, imm;
 +    TCGv_i32 count;
      tcg_ctx->exitreq_label = gen_new_label();
      if (tb_cflags(tb) & CF_USE_ICOUNT) {
@@ -XXX,XX +XXX,XX @@ static inline void gen_tb_start(TranslationBlock *tb)
                     offsetof(ArchCPU, env));
      if (tb_cflags(tb) & CF_USE_ICOUNT) {
 -        imm = tcg_temp_new_i32();
 -        /* We emit a movi with a dummy immediate argument. Keep the insn index
 -         * of the movi so that we later (when we know the actual insn count)
 -         * can update the immediate argument with the actual insn count.  */
 -        tcg_gen_movi_i32(imm, 0xdeadbeef);
 +        /*
 +         * We emit a sub with a dummy immediate argument. Keep the insn index
 +         * of the sub so that we later (when we know the actual insn count)
 +         * can update the argument with the actual insn count.
 +         */
 +        tcg_gen_sub_i32(count, count, tcg_constant_i32(0));
          icount_start_insn = tcg_last_op();
 -
 -        tcg_gen_sub_i32(count, count, imm);
 -        tcg_temp_free_i32(imm);
      }
-     tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, tcg_ctx->exitreq_label);
+     ctx->z_mask = 1;
-@@ -XXX,XX +XXX,XX @@ static inline void gen_tb_start(TranslationBlock *tb)
++    ctx->s_mask = smask_from_zmask(1);
- static inline void gen_tb_end(TranslationBlock *tb, int num_insns)
+     return false;
- {
+ }
-     if (tb_cflags(tb) & CF_USE_ICOUNT) {
--        /* Update the num_insn immediate parameter now that we know
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 -         * the actual insn count.  */
 -        tcg_set_insn_param(icount_start_insn, 1, num_insns);
 +        /*
 +         * Update the num_insn immediate parameter now that we know
 +         * the actual insn count.
 +         */
 +        tcg_set_insn_param(icount_start_insn, 2,
 +                           tcgv_i32_arg(tcg_constant_i32(num_insns)));
      }
-     gen_set_label(tcg_ctx->exitreq_label);
+     ctx->z_mask = 1;
 +    ctx->s_mask = smask_from_zmask(1);
      return false;
   do_setcond_const:
 --
 .25.1

-[PATCH 31/43] tcg: Remove tcg_gen_dup{8,16,32,64}i_vec
+[PULL 55/56] tcg/optimize: Propagate sign info for bit counting
-These interfaces have been replaced by tcg_gen_dupi_vec
+The results are generally 6 bit unsigned values, though
-and tcg_constant_vec.
+the count leading and trailing bits may produce any value
 for a zero input.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/tcg/tcg-op.h |  4 ----
+ tcg/optimize.c | 3 ++-
- tcg/tcg-op-vec.c     | 20 --------------------
+file changed, 2 insertions(+), 1 deletion(-)
 files changed, 24 deletions(-)
-diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/tcg/tcg-op.h
+--- a/tcg/optimize.c
-+++ b/include/tcg/tcg-op.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
- void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
+         g_assert_not_reached();
- void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
+     }
- void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
+     ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
--void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
+-
--void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
--void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
+     return false;
 -void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
  void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
  void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
  void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/tcg-op-vec.c
 +++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
      return tcg_const_ones_vec(t->base_type);
  }
--void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
--{
+     default:
--    tcg_gen_dupi_vec(MO_64, r, a);
+         g_assert_not_reached();
--}
+     }
--
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
--void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
+     return false;
--{
+ }
--    tcg_gen_dupi_vec(MO_32, r, a);
 -}
 -
 -void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
 -{
 -    tcg_gen_dupi_vec(MO_16, r, a);
 -}
 -
 -void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
 -{
 -    tcg_gen_dupi_vec(MO_8, r, a);
 -}
 -
  void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
  {
      TCGTemp *rt = tcgv_vec_temp(r);
 --
 .25.1

-[PATCH 19/43] tcg/optimize: Improve find_better_copy
+[PULL 56/56] tcg/optimize: Propagate sign info for shifting
-Prefer TEMP_CONST over anything else.
+For constant shifts, we can simply shift the s_mask.
+For variable shifts, we know that sar does not reduce
+the s_mask, which helps for sequences like
+    ext32s_i64  t, in
+    sar_i64     t, t, v
+    ext32s_i64  out, t
+allowing the final extend to be eliminated.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/optimize.c | 27 ++++++++++++---------------
+ tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
-file changed, 12 insertions(+), 15 deletions(-)
+file changed, 47 insertions(+), 3 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void init_arg_info(TempOptInfo *infos,
+@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
+     return ~(~0ull >> rep);
- static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
+ }
 +/*
 + * Recreate a properly left-aligned smask after manipulation.
 + * Some bit-shuffling, particularly shifts and rotates, may
 + * retain sign bits on the left, but may scatter disconnected
 + * sign bits on the right.  Retain only what remains to the left.
 + */
 +static uint64_t smask_from_smask(int64_t smask)
 +{
 +    /* Only the 1 bits are significant for smask */
 +    return smask_from_zmask(~smask);
 +}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
--    TCGTemp *i;
+     return ts->state_ptr;
-+    TCGTemp *i, *g, *l;
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
--    /* If this is already a global, we can't do better. */
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
--    if (ts->kind >= TEMP_GLOBAL) {
+ {
-+    /* If this is already readonly, we can't do better. */
++    uint64_t s_mask, z_mask, sign;
-+    if (temp_readonly(ts)) {
++
-         return ts;
+     if (fold_const2(ctx, op) ||
          fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
--    /* Search for a global first. */
++    s_mask = arg_info(op->args[1])->s_mask;
-+    g = l = NULL;
++    z_mask = arg_info(op->args[1])->z_mask;
-     for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
++
--        if (i->kind >= TEMP_GLOBAL) {
+     if (arg_is_const(op->args[2])) {
-+        if (temp_readonly(i)) {
+-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-             return i;
+-                                          arg_info(op->args[1])->z_mask,
--        }
+-                                          arg_info(op->args[2])->val);
--    }
++        int sh = arg_info(op->args[2])->val;
--
++
--    /* If it is a temp, search for a temp local. */
++        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
--    if (ts->kind == TEMP_NORMAL) {
++
--        for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
++        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
--            if (i->kind >= TEMP_LOCAL) {
++        ctx->s_mask = smask_from_smask(s_mask);
--                return i;
++
-+        } else if (i->kind > ts->kind) {
+         return fold_masks(ctx, op);
 +            if (i->kind == TEMP_GLOBAL) {
 +                g = i;
 +            } else if (i->kind == TEMP_LOCAL) {
 +                l = i;
              }
          }
      }
++
--    /* Failure to find a better representation, return the same temp. */
++    switch (op->opc) {
--    return ts;
++    CASE_OP_32_64(sar):
-+    /* If we didn't find a better representation, return the same temp. */
++        /*
-+    return g ? g : l ? l : ts;
++         * Arithmetic right shift will not reduce the number of
 +         * input sign repetitions.
 +         */
 +        ctx->s_mask = s_mask;
 +        break;
 +    CASE_OP_32_64(shr):
 +        /*
 +         * If the sign bit is known zero, then logical right shift
 +         * will not reduce the number of input sign repetitions.
 +         */
 +        sign = (s_mask & -s_mask) >> 1;
 +        if (!(z_mask & sign)) {
 +            ctx->s_mask = s_mask;
 +        }
 +        break;
 +    default:
 +        break;
 +    }
 +
      return false;
  }
- static bool ts_are_copies(TCGTemp *ts1, TCGTemp *ts2)
 --
 .25.1

This patch collection contains:

* A couple of fixes for i386 host vector support.

* Some random cleanups cherry-picked from some inactive branches.

* A reposting (with fix) of my "better handling of constants" set:

https://lists.nongnu.org/archive/html/qemu-devel/2020-05/msg02152.html

* A couple of patches that centralize the set of host constraints.
    This, I believe, is slightly cleaner than the current state of
    affairs, even before the ultimate goal of pre-validating the
    contents as well.

Richard Henderson (43):
  tcg: Adjust simd_desc size encoding
  tcg: Drop union from TCGArgConstraint
  tcg: Move sorted_args into TCGArgConstraint.sort_index
  tcg: Remove TCG_CT_REG
  tcg: Move some TCG_CT_* bits to TCGArgConstraint bitfields
  tcg: Remove TCGOpDef.used
  tcg/i386: Fix dupi for avx2 32-bit hosts
  tcg: Fix generation of dupi_vec for 32-bit host
  tcg/optimize: Fold dup2_vec
  tcg: Remove TCG_TARGET_HAS_cmp_vec
  tcg: Use tcg_out_dupi_vec from temp_load
  tcg: Increase tcg_out_dupi_vec immediate to int64_t
  tcg: Consolidate 3 bits into enum TCGTempKind
  tcg: Add temp_readonly
  tcg: Expand TCGTemp.val to 64-bits
  tcg: Rename struct tcg_temp_info to TempOptInfo
  tcg: Expand TempOptInfo to 64-bits
  tcg: Introduce TYPE_CONST temporaries
  tcg/optimize: Improve find_better_copy
  tcg/optimize: Adjust TempOptInfo allocation
  tcg/optimize: Use tcg_constant_internal with constant folding
  tcg: Convert tcg_gen_dupi_vec to TCG_CONST
  tcg: Use tcg_constant_i32 with icount expander
  tcg: Use tcg_constant_{i32,i64} with tcg int expanders
  tcg: Use tcg_constant_{i32,i64} with tcg plugins
  tcg: Use tcg_constant_{i32,i64,vec} with gvec expanders
  tcg/tci: Add special tci_movi_{i32,i64} opcodes
  tcg: Remove movi and dupi opcodes
  tcg: Add tcg_reg_alloc_dup2
  tcg/i386: Use tcg_constant_vec with tcg vec expanders
  tcg: Remove tcg_gen_dup{8,16,32,64}i_vec
  tcg/ppc: Use tcg_constant_vec with tcg vec expanders
  tcg/aarch64: Use tcg_constant_vec with tcg vec expanders
  tcg: Add tcg-constr.c.inc
  tcg/i386: Convert to tcg-constr.c.inc
  tcg/aarch64: Convert to tcg-constr.c.inc
  tcg/arm: Convert to tcg-constr.c.inc
  tcg/mips: Convert to tcg-constr.c.inc
  tcg/ppc: Convert to tcg-constr.c.inc
  tcg/riscv: Convert to tcg-constr.c.inc
  tcg/s390: Convert to tcg-constr.c.inc
  tcg/sparc: Convert to tcg-constr.c.inc
  tcg/tci: Convert to tcg-constr.c.inc

include/exec/gen-icount.h       |  25 +-
 include/tcg/tcg-gvec-desc.h     |  38 ++-
 include/tcg/tcg-op.h            |  17 +-
 include/tcg/tcg-opc.h           |  11 +-
 include/tcg/tcg.h               |  72 +++--
 tcg/aarch64/tcg-target-constr.h |  31 ++
 tcg/aarch64/tcg-target.h        |   1 -
 tcg/arm/tcg-target-constr.h     |  30 ++
 tcg/i386/tcg-target-constr.h    |  55 ++++
 tcg/i386/tcg-target.h           |   1 -
 tcg/mips/tcg-target-constr.h    |  31 ++
 tcg/ppc/tcg-target-constr.h     |  37 +++
 tcg/ppc/tcg-target.h            |   1 -
 tcg/riscv/tcg-target-constr.h   |  25 ++
 tcg/s390/tcg-target-constr.h    |  24 ++
 tcg/sparc/tcg-target-constr.h   |  27 ++
 tcg/tci/tcg-target-constr.h     |  28 ++
 accel/tcg/plugin-gen.c          |  49 ++-
 tcg/optimize.c                  | 254 ++++++++-------
 tcg/tcg-op-gvec.c               | 160 +++++-----
 tcg/tcg-op-vec.c                |  48 +--
 tcg/tcg-op.c                    | 227 +++++++------
 tcg/tcg.c                       | 549 +++++++++++++++++++++++---------
 tcg/tci.c                       |   4 +-
 tcg/aarch64/tcg-target.c.inc    | 134 +++-----
 tcg/arm/tcg-target.c.inc        | 123 +++----
 tcg/i386/tcg-target.c.inc       | 336 +++++++++----------
 tcg/mips/tcg-target.c.inc       | 118 +++----
 tcg/ppc/tcg-target.c.inc        | 254 +++++++--------
 tcg/riscv/tcg-target.c.inc      | 100 ++----
 tcg/s390/tcg-target.c.inc       | 143 ++++-----
 tcg/sparc/tcg-target.c.inc      |  97 ++----
 tcg/tcg-constr.c.inc            | 108 +++++++
 tcg/tci/tcg-target.c.inc        | 369 ++++++++-------------
 34 files changed, 1893 insertions(+), 1634 deletions(-)
 create mode 100644 tcg/aarch64/tcg-target-constr.h
 create mode 100644 tcg/arm/tcg-target-constr.h
 create mode 100644 tcg/i386/tcg-target-constr.h
 create mode 100644 tcg/mips/tcg-target-constr.h
 create mode 100644 tcg/ppc/tcg-target-constr.h
 create mode 100644 tcg/riscv/tcg-target-constr.h
 create mode 100644 tcg/s390/tcg-target-constr.h
 create mode 100644 tcg/sparc/tcg-target-constr.h
 create mode 100644 tcg/tci/tcg-target-constr.h
 create mode 100644 tcg/tcg-constr.c.inc

-- 
2.25.1

With larger vector sizes, it turns out oprsz == maxsz, and we only
need to represent mismatch for oprsz <= 32.  We do, however, need
to represent larger oprsz and do so without reducing SIMD_DATA_BITS.

Reduce the size of the oprsz field and increase the maxsz field.
Steal the oprsz value of 24 to indicate equality with maxsz.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-gvec-desc.h | 38 ++++++++++++++++++++++++-------------
 tcg/tcg-op-gvec.c           | 35 ++++++++++++++++++++++++++--------
 2 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/include/tcg/tcg-gvec-desc.h b/include/tcg/tcg-gvec-desc.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-gvec-desc.h
+++ b/include/tcg/tcg-gvec-desc.h
@@ -XXX,XX +XXX,XX @@
 #ifndef TCG_TCG_GVEC_DESC_H
 #define TCG_TCG_GVEC_DESC_H
 
-/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
-#define SIMD_OPRSZ_SHIFT   0
-#define SIMD_OPRSZ_BITS    5
+/*
+ * This configuration allows MAXSZ to represent 2048 bytes, and
+ * OPRSZ to match MAXSZ, or represent the smaller values 8, 16, or 32.
+ *
+ * Encode this with:
+ *   0, 1, 3 -> 8, 16, 32
+ *   2       -> maxsz
+ *
+ * This steals the input that would otherwise map to 24 to match maxsz.
+ */
+#define SIMD_MAXSZ_SHIFT   0
+#define SIMD_MAXSZ_BITS    8
 
-#define SIMD_MAXSZ_SHIFT   (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
-#define SIMD_MAXSZ_BITS    5
+#define SIMD_OPRSZ_SHIFT   (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
+#define SIMD_OPRSZ_BITS    2
 
-#define SIMD_DATA_SHIFT    (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
+#define SIMD_DATA_SHIFT    (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
 #define SIMD_DATA_BITS     (32 - SIMD_DATA_SHIFT)
 
 /* Create a descriptor from components.  */
 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
 
-/* Extract the operation size from a descriptor.  */
-static inline intptr_t simd_oprsz(uint32_t desc)
-{
-    return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
-}
-
 /* Extract the max vector size from a descriptor.  */
 static inline intptr_t simd_maxsz(uint32_t desc)
 {
-    return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
+    return extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) * 8 + 8;
+}
+
+/* Extract the operation size from a descriptor.  */
+static inline intptr_t simd_oprsz(uint32_t desc)
+{
+    uint32_t f = extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS);
+    intptr_t o = f * 8 + 8;
+    intptr_t m = simd_maxsz(desc);
+    return f == 2 ? m : o;
 }
 
 /* Extract the operation-specific data from a descriptor.  */
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ static const TCGOpcode vecop_list_empty[1] = { 0 };
    of the operand offsets so that we can check them all at once.  */
 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
 {
-    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
-    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
-    tcg_debug_assert(oprsz > 0);
-    tcg_debug_assert(oprsz <= maxsz);
-    tcg_debug_assert((oprsz & opr_align) == 0);
+    uint32_t max_align;
+
+    switch (oprsz) {
+    case 8:
+    case 16:
+    case 32:
+        tcg_debug_assert(oprsz <= maxsz);
+        break;
+    default:
+        tcg_debug_assert(oprsz == maxsz);
+        break;
+    }
+    tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
+
+    max_align = maxsz >= 16 ? 15 : 7;
     tcg_debug_assert((maxsz & max_align) == 0);
     tcg_debug_assert((ofs & max_align) == 0);
 }
@@ -XXX,XX +XXX,XX @@ uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
 {
     uint32_t desc = 0;
 
-    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
-    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
-    assert(data == sextract32(data, 0, SIMD_DATA_BITS));
+    check_size_align(oprsz, maxsz, 0);
+    tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
 
     oprsz = (oprsz / 8) - 1;
     maxsz = (maxsz / 8) - 1;
+
+    /*
+     * We have just asserted in check_size_align that either
+     * oprsz is {8,16,32} or matches maxsz.  Encode the final
+     * case with '2', as that would otherwise map to 24.
+     */
+    if (oprsz == maxsz) {
+        oprsz = 2;
+    }
+
     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
-- 
2.25.1

The union is unused; let "regs" appear in the main structure
without the "u.regs" wrapping.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h            |  4 +---
 tcg/tcg.c                    | 22 +++++++++++-----------
 tcg/aarch64/tcg-target.c.inc | 14 +++++++-------
 tcg/arm/tcg-target.c.inc     | 26 +++++++++++++-------------
 tcg/i386/tcg-target.c.inc    | 26 +++++++++++++-------------
 tcg/mips/tcg-target.c.inc    | 18 +++++++++---------
 tcg/ppc/tcg-target.c.inc     | 24 ++++++++++++------------
 tcg/riscv/tcg-target.c.inc   | 14 +++++++-------
 tcg/s390/tcg-target.c.inc    | 18 +++++++++---------
 tcg/sparc/tcg-target.c.inc   | 16 ++++++++--------
 tcg/tci/tcg-target.c.inc     |  2 +-
 11 files changed, 91 insertions(+), 93 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ void tcg_dump_op_count(void);
 typedef struct TCGArgConstraint {
     uint16_t ct;
     uint8_t alias_index;
-    union {
-        TCGRegSet regs;
-    } u;
+    TCGRegSet regs;
 } TCGArgConstraint;
 
 #define TCG_MAX_OP_ARGS 16
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static int get_constraint_priority(const TCGOpDef *def, int k)
             return 0;
         n = 0;
         for(i = 0; i < TCG_TARGET_NB_REGS; i++) {
-            if (tcg_regset_test_reg(arg_ct->u.regs, i))
+            if (tcg_regset_test_reg(arg_ct->regs, i))
                 n++;
         }
     }
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
             /* Incomplete TCGTargetOpDef entry. */
             tcg_debug_assert(ct_str != NULL);
 
-            def->args_ct[i].u.regs = 0;
+            def->args_ct[i].regs = 0;
             def->args_ct[i].ct = 0;
             while (*ct_str != '\0') {
                 switch(*ct_str) {
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
                     pset = la_temp_pref(ts);
                     set = *pset;
 
-                    set &= ct->u.regs;
+                    set &= ct->regs;
                     if (ct->ct & TCG_CT_IALIAS) {
                         set &= op->output_pref[ct->alias_index];
                     }
                     /* If the combination is not possible, restart.  */
                     if (set == 0) {
-                        set = ct->u.regs;
+                        set = ct->regs;
                     }
                     *pset = set;
                 }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
         return;
     }
 
-    dup_out_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[0].u.regs;
-    dup_in_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[1].u.regs;
+    dup_out_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[0].regs;
+    dup_in_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[1].regs;
 
     /* Allocate the output register now.  */
     if (ots->val_type != TEMP_VAL_REG) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             }
         }
 
-        temp_load(s, ts, arg_ct->u.regs, i_allocated_regs, i_preferred_regs);
+        temp_load(s, ts, arg_ct->regs, i_allocated_regs, i_preferred_regs);
         reg = ts->reg;
 
-        if (tcg_regset_test_reg(arg_ct->u.regs, reg)) {
+        if (tcg_regset_test_reg(arg_ct->regs, reg)) {
             /* nothing to do : the constraint is satisfied */
         } else {
         allocate_in_reg:
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                and move the temporary register into it */
             temp_load(s, ts, tcg_target_available_regs[ts->type],
                       i_allocated_regs, 0);
-            reg = tcg_reg_alloc(s, arg_ct->u.regs, i_allocated_regs,
+            reg = tcg_reg_alloc(s, arg_ct->regs, i_allocated_regs,
                                 o_preferred_regs, ts->indirect_base);
             if (!tcg_out_mov(s, ts->type, reg, ts->reg)) {
                 /*
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                 && !const_args[arg_ct->alias_index]) {
                 reg = new_args[arg_ct->alias_index];
             } else if (arg_ct->ct & TCG_CT_NEWREG) {
-                reg = tcg_reg_alloc(s, arg_ct->u.regs,
+                reg = tcg_reg_alloc(s, arg_ct->regs,
                                     i_allocated_regs | o_allocated_regs,
                                     op->output_pref[k], ts->indirect_base);
             } else {
-                reg = tcg_reg_alloc(s, arg_ct->u.regs, o_allocated_regs,
+                reg = tcg_reg_alloc(s, arg_ct->regs, o_allocated_regs,
                                     op->output_pref[k], ts->indirect_base);
             }
             tcg_regset_set_reg(o_allocated_regs, reg);
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     switch (*ct_str++) {
     case 'r': /* general registers */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs |= 0xffffffffu;
+        ct->regs |= 0xffffffffu;
         break;
     case 'w': /* advsimd registers */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs |= 0xffffffff00000000ull;
+        ct->regs |= 0xffffffff00000000ull;
         break;
     case 'l': /* qemu_ld / qemu_st address, data_reg */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffffu;
+        ct->regs = 0xffffffffu;
 #ifdef CONFIG_SOFTMMU
         /* x0 and x1 will be overwritten when reading the tlb entry,
            and x2, and x3 for helper args, better to avoid using them. */
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_X0);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_X1);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_X2);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_X3);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_X0);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_X1);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_X2);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_X3);
 #endif
         break;
     case 'A': /* Valid for arithmetic immediate (positive or negative).  */
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 
     case 'r':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffff;
+        ct->regs = 0xffff;
         break;
 
     /* qemu_ld address */
     case 'l':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffff;
+        ct->regs = 0xffff;
 #ifdef CONFIG_SOFTMMU
         /* r0-r2,lr will be overwritten when reading the tlb entry,
            so don't use these. */
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R0);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R1);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R2);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R14);
 #endif
         break;
 
     /* qemu_st address & data */
     case 's':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffff;
+        ct->regs = 0xffff;
         /* r0-r2 will be overwritten when reading the tlb entry (softmmu only)
            and r0-r1 doing the byte swapping, so don't use these. */
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R0);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R1);
 #if defined(CONFIG_SOFTMMU)
         /* Avoid clashes with registers being used for helper args */
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R2);
 #if TARGET_LONG_BITS == 64
         /* Avoid clashes with registers being used for helper args */
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
 #endif
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R14);
 #endif
         break;
 
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     switch(*ct_str++) {
     case 'a':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
+        tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
         break;
     case 'b':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
+        tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
         break;
     case 'c':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
+        tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
         break;
     case 'd':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
+        tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
         break;
     case 'S':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
+        tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
         break;
     case 'D':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
+        tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
         break;
     case 'q':
         /* A register that can be used as a byte operand.  */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
+        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
         break;
     case 'Q':
         /* A register with an addressable second byte (e.g. %ah).  */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xf;
+        ct->regs = 0xf;
         break;
     case 'r':
         /* A general register.  */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs |= ALL_GENERAL_REGS;
+        ct->regs |= ALL_GENERAL_REGS;
         break;
     case 'W':
         /* With TZCNT/LZCNT, we can have operand-size as an input.  */
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     case 'x':
         /* A vector register.  */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs |= ALL_VECTOR_REGS;
+        ct->regs |= ALL_VECTOR_REGS;
         break;
 
         /* qemu_ld/st address constraint */
     case 'L':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
+        ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
+        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
         break;
 
     case 'e':
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     switch(*ct_str++) {
     case 'r':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
+        ct->regs = 0xffffffff;
         break;
     case 'L': /* qemu_ld input arg constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
+        ct->regs = 0xffffffff;
+        tcg_regset_reset_reg(ct->regs, TCG_REG_A0);
 #if defined(CONFIG_SOFTMMU)
         if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
+            tcg_regset_reset_reg(ct->regs, TCG_REG_A2);
         }
 #endif
         break;
     case 'S': /* qemu_st constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
+        ct->regs = 0xffffffff;
+        tcg_regset_reset_reg(ct->regs, TCG_REG_A0);
 #if defined(CONFIG_SOFTMMU)
         if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A2);
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A3);
+            tcg_regset_reset_reg(ct->regs, TCG_REG_A2);
+            tcg_regset_reset_reg(ct->regs, TCG_REG_A3);
         } else {
-            tcg_regset_reset_reg(ct->u.regs, TCG_REG_A1);
+            tcg_regset_reset_reg(ct->regs, TCG_REG_A1);
         }
 #endif
         break;
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     switch (*ct_str++) {
     case 'A': case 'B': case 'C': case 'D':
         ct->ct |= TCG_CT_REG;
-        tcg_regset_set_reg(ct->u.regs, 3 + ct_str[0] - 'A');
+        tcg_regset_set_reg(ct->regs, 3 + ct_str[0] - 'A');
         break;
     case 'r':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
+        ct->regs = 0xffffffff;
         break;
     case 'v':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff00000000ull;
+        ct->regs = 0xffffffff00000000ull;
         break;
     case 'L':                   /* qemu_ld constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
+        ct->regs = 0xffffffff;
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
 #ifdef CONFIG_SOFTMMU
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R4);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R5);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R4);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R5);
 #endif
         break;
     case 'S':                   /* qemu_st constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
+        ct->regs = 0xffffffff;
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
 #ifdef CONFIG_SOFTMMU
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R4);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R5);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R6);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R4);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R5);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R6);
 #endif
         break;
     case 'I':
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     switch (*ct_str++) {
     case 'r':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
+        ct->regs = 0xffffffff;
         break;
     case 'L':
         /* qemu_ld/qemu_st constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
+        ct->regs = 0xffffffff;
         /* qemu_ld/qemu_st uses TCG_REG_TMP0 */
 #if defined(CONFIG_SOFTMMU)
-        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[0]);
-        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[1]);
-        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[2]);
-        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[3]);
-        tcg_regset_reset_reg(ct->u.regs, tcg_target_call_iarg_regs[4]);
+        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[0]);
+        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[1]);
+        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[2]);
+        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[3]);
+        tcg_regset_reset_reg(ct->regs, tcg_target_call_iarg_regs[4]);
 #endif
         break;
     case 'I':
diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390/tcg-target.c.inc
+++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     switch (*ct_str++) {
     case 'r':                  /* all registers */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffff;
+        ct->regs = 0xffff;
         break;
     case 'L':                  /* qemu_ld/st constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffff;
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_R4);
+        ct->regs = 0xffff;
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R2);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_R4);
         break;
     case 'a':                  /* force R2 for division */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0;
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_R2);
+        ct->regs = 0;
+        tcg_regset_set_reg(ct->regs, TCG_REG_R2);
         break;
     case 'b':                  /* force R3 for division */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0;
-        tcg_regset_set_reg(ct->u.regs, TCG_REG_R3);
+        ct->regs = 0;
+        tcg_regset_set_reg(ct->regs, TCG_REG_R3);
         break;
     case 'A':
         ct->ct |= TCG_CT_CONST_S33;
diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     switch (*ct_str++) {
     case 'r':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
+        ct->regs = 0xffffffff;
         break;
     case 'R':
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = ALL_64;
+        ct->regs = ALL_64;
         break;
     case 'A': /* qemu_ld/st address constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = TARGET_LONG_BITS == 64 ? ALL_64 : 0xffffffff;
+        ct->regs = TARGET_LONG_BITS == 64 ? ALL_64 : 0xffffffff;
     reserve_helpers:
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_O0);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_O1);
-        tcg_regset_reset_reg(ct->u.regs, TCG_REG_O2);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_O0);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_O1);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_O2);
         break;
     case 's': /* qemu_st data 32-bit constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = 0xffffffff;
+        ct->regs = 0xffffffff;
         goto reserve_helpers;
     case 'S': /* qemu_st data 64-bit constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = ALL_64;
+        ct->regs = ALL_64;
         goto reserve_helpers;
     case 'I':
         ct->ct |= TCG_CT_CONST_S11;
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     case 'L':                   /* qemu_ld constraint */
     case 'S':                   /* qemu_st constraint */
         ct->ct |= TCG_CT_REG;
-        ct->u.regs = BIT(TCG_TARGET_NB_REGS) - 1;
+        ct->regs = BIT(TCG_TARGET_NB_REGS) - 1;
         break;
     default:
         return NULL;
-- 
2.25.1

This uses an existing hole in the TCGArgConstraint structure
and will be convenient for keeping the data in one place.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h |  2 +-
 tcg/tcg.c         | 35 +++++++++++++++++------------------
 2 files changed, 18 insertions(+), 19 deletions(-)

This wasn't actually used for anything, really.  All variable
operands must accept registers, which are indicated by the
set in TCGArgConstraint.regs.

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ void tcg_dump_op_count(void);
 #define TCG_CT_ALIAS  0x80
 #define TCG_CT_IALIAS 0x40
 #define TCG_CT_NEWREG 0x20 /* output requires a new register */
-#define TCG_CT_REG    0x01
 #define TCG_CT_CONST  0x02 /* any constant of register size */
 
 typedef struct TCGArgConstraint {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_dump_ops(TCGContext *s, bool have_prefs)
 /* we give more priority to constraints with less registers */
 static int get_constraint_priority(const TCGOpDef *def, int k)
 {
-    const TCGArgConstraint *arg_ct;
+    const TCGArgConstraint *arg_ct = &def->args_ct[k];
+    int n;
 
-    int i, n;
-    arg_ct = &def->args_ct[k];
     if (arg_ct->ct & TCG_CT_ALIAS) {
         /* an alias is equivalent to a single register */
         n = 1;
     } else {
-        if (!(arg_ct->ct & TCG_CT_REG))
-            return 0;
-        n = 0;
-        for(i = 0; i < TCG_TARGET_NB_REGS; i++) {
-            if (tcg_regset_test_reg(arg_ct->regs, i))
-                n++;
-        }
+        n = ctpop64(arg_ct->regs);
     }
     return TCG_TARGET_NB_REGS - n + 1;
 }
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
                         int oarg = *ct_str - '0';
                         tcg_debug_assert(ct_str == tdefs->args_ct_str[i]);
                         tcg_debug_assert(oarg < def->nb_oargs);
-                        tcg_debug_assert(def->args_ct[oarg].ct & TCG_CT_REG);
+                        tcg_debug_assert(def->args_ct[oarg].regs != 0);
                         /* TCG_CT_ALIAS is for the output arguments.
                            The input is tagged with TCG_CT_IALIAS. */
                         def->args_ct[i] = def->args_ct[oarg];
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 {
     switch (*ct_str++) {
     case 'r': /* general registers */
-        ct->ct |= TCG_CT_REG;
         ct->regs |= 0xffffffffu;
         break;
     case 'w': /* advsimd registers */
-        ct->ct |= TCG_CT_REG;
         ct->regs |= 0xffffffff00000000ull;
         break;
     case 'l': /* qemu_ld / qemu_st address, data_reg */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffffu;
 #ifdef CONFIG_SOFTMMU
         /* x0 and x1 will be overwritten when reading the tlb entry,
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
         break;
 
     case 'r':
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffff;
         break;
 
     /* qemu_ld address */
     case 'l':
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffff;
 #ifdef CONFIG_SOFTMMU
         /* r0-r2,lr will be overwritten when reading the tlb entry,
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 
     /* qemu_st address & data */
     case 's':
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffff;
         /* r0-r2 will be overwritten when reading the tlb entry (softmmu only)
            and r0-r1 doing the byte swapping, so don't use these. */
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 {
     switch(*ct_str++) {
     case 'a':
-        ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->regs, TCG_REG_EAX);
         break;
     case 'b':
-        ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->regs, TCG_REG_EBX);
         break;
     case 'c':
-        ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->regs, TCG_REG_ECX);
         break;
     case 'd':
-        ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->regs, TCG_REG_EDX);
         break;
     case 'S':
-        ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->regs, TCG_REG_ESI);
         break;
     case 'D':
-        ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->regs, TCG_REG_EDI);
         break;
     case 'q':
         /* A register that can be used as a byte operand.  */
-        ct->ct |= TCG_CT_REG;
         ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
         break;
     case 'Q':
         /* A register with an addressable second byte (e.g. %ah).  */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xf;
         break;
     case 'r':
         /* A general register.  */
-        ct->ct |= TCG_CT_REG;
         ct->regs |= ALL_GENERAL_REGS;
         break;
     case 'W':
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
         break;
     case 'x':
         /* A vector register.  */
-        ct->ct |= TCG_CT_REG;
         ct->regs |= ALL_VECTOR_REGS;
         break;
 
         /* qemu_ld/st address constraint */
     case 'L':
-        ct->ct |= TCG_CT_REG;
         ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
         tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
         tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 {
     switch(*ct_str++) {
     case 'r':
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         break;
     case 'L': /* qemu_ld input arg constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         tcg_regset_reset_reg(ct->regs, TCG_REG_A0);
 #if defined(CONFIG_SOFTMMU)
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 #endif
         break;
     case 'S': /* qemu_st constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         tcg_regset_reset_reg(ct->regs, TCG_REG_A0);
 #if defined(CONFIG_SOFTMMU)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 {
     switch (*ct_str++) {
     case 'A': case 'B': case 'C': case 'D':
-        ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->regs, 3 + ct_str[0] - 'A');
         break;
     case 'r':
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         break;
     case 'v':
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff00000000ull;
         break;
     case 'L':                   /* qemu_ld constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
 #ifdef CONFIG_SOFTMMU
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 #endif
         break;
     case 'S':                   /* qemu_st constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
 #ifdef CONFIG_SOFTMMU
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 {
     switch (*ct_str++) {
     case 'r':
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         break;
     case 'L':
         /* qemu_ld/qemu_st constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         /* qemu_ld/qemu_st uses TCG_REG_TMP0 */
 #if defined(CONFIG_SOFTMMU)
diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390/tcg-target.c.inc
+++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 {
     switch (*ct_str++) {
     case 'r':                  /* all registers */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffff;
         break;
     case 'L':                  /* qemu_ld/st constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffff;
         tcg_regset_reset_reg(ct->regs, TCG_REG_R2);
         tcg_regset_reset_reg(ct->regs, TCG_REG_R3);
         tcg_regset_reset_reg(ct->regs, TCG_REG_R4);
         break;
     case 'a':                  /* force R2 for division */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0;
         tcg_regset_set_reg(ct->regs, TCG_REG_R2);
         break;
     case 'b':                  /* force R3 for division */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0;
         tcg_regset_set_reg(ct->regs, TCG_REG_R3);
         break;
diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
 {
     switch (*ct_str++) {
     case 'r':
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         break;
     case 'R':
-        ct->ct |= TCG_CT_REG;
         ct->regs = ALL_64;
         break;
     case 'A': /* qemu_ld/st address constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = TARGET_LONG_BITS == 64 ? ALL_64 : 0xffffffff;
     reserve_helpers:
         tcg_regset_reset_reg(ct->regs, TCG_REG_O0);
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
         tcg_regset_reset_reg(ct->regs, TCG_REG_O2);
         break;
     case 's': /* qemu_st data 32-bit constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = 0xffffffff;
         goto reserve_helpers;
     case 'S': /* qemu_st data 64-bit constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = ALL_64;
         goto reserve_helpers;
     case 'I':
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
     case 'r':
     case 'L':                   /* qemu_ld constraint */
     case 'S':                   /* qemu_st constraint */
-        ct->ct |= TCG_CT_REG;
         ct->regs = BIT(TCG_TARGET_NB_REGS) - 1;
         break;
     default:
-- 
2.25.1

These are easier to set and test when they have their own fields.
Reduce the size of alias_index and sort_index to 4 bits, which is
sufficient for TCG_MAX_OP_ARGS.  This leaves only the bits indicating
constants within the ct field.

Move all initialization to allocation time, rather than initializing
individual fields in process_op_defs.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h | 14 +++++++-------
 tcg/tcg.c         | 28 ++++++++++++----------------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ int64_t tcg_cpu_exec_time(void);
 void tcg_dump_info(void);
 void tcg_dump_op_count(void);
 
-#define TCG_CT_ALIAS  0x80
-#define TCG_CT_IALIAS 0x40
-#define TCG_CT_NEWREG 0x20 /* output requires a new register */
-#define TCG_CT_CONST  0x02 /* any constant of register size */
+#define TCG_CT_CONST  1 /* any constant of register size */
 
 typedef struct TCGArgConstraint {
-    uint16_t ct;
-    uint8_t alias_index;
-    uint8_t sort_index;
+    unsigned ct : 16;
+    unsigned alias_index : 4;
+    unsigned sort_index : 4;
+    bool oalias : 1;
+    bool ialias : 1;
+    bool newreg : 1;
     TCGRegSet regs;
 } TCGArgConstraint;
 
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ void tcg_context_init(TCGContext *s)
         total_args += n;
     }
 
-    args_ct = g_malloc(sizeof(TCGArgConstraint) * total_args);
+    args_ct = g_new0(TCGArgConstraint, total_args);
 
     for(op = 0; op < NB_OPS; op++) {
         def = &tcg_op_defs[op];
@@ -XXX,XX +XXX,XX @@ static int get_constraint_priority(const TCGOpDef *def, int k)
     const TCGArgConstraint *arg_ct = &def->args_ct[k];
     int n;
 
-    if (arg_ct->ct & TCG_CT_ALIAS) {
+    if (arg_ct->oalias) {
         /* an alias is equivalent to a single register */
         n = 1;
     } else {
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
             /* Incomplete TCGTargetOpDef entry. */
             tcg_debug_assert(ct_str != NULL);
 
-            def->args_ct[i].regs = 0;
-            def->args_ct[i].ct = 0;
             while (*ct_str != '\0') {
                 switch(*ct_str) {
                 case '0' ... '9':
@@ -XXX,XX +XXX,XX @@ static void process_op_defs(TCGContext *s)
                         tcg_debug_assert(ct_str == tdefs->args_ct_str[i]);
                         tcg_debug_assert(oarg < def->nb_oargs);
                         tcg_debug_assert(def->args_ct[oarg].regs != 0);
-                        /* TCG_CT_ALIAS is for the output arguments.
-                           The input is tagged with TCG_CT_IALIAS. */
                         def->args_ct[i] = def->args_ct[oarg];
-                        def->args_ct[oarg].ct |= TCG_CT_ALIAS;
+                        /* The output sets oalias.  */
+                        def->args_ct[oarg].oalias = true;
                         def->args_ct[oarg].alias_index = i;
-                        def->args_ct[i].ct |= TCG_CT_IALIAS;
+                        /* The input sets ialias. */
+                        def->args_ct[i].ialias = true;
                         def->args_ct[i].alias_index = oarg;
                     }
                     ct_str++;
                     break;
                 case '&':
-                    def->args_ct[i].ct |= TCG_CT_NEWREG;
+                    def->args_ct[i].newreg = true;
                     ct_str++;
                     break;
                 case 'i':
@@ -XXX,XX +XXX,XX @@ static void liveness_pass_1(TCGContext *s)
                     set = *pset;
 
                     set &= ct->regs;
-                    if (ct->ct & TCG_CT_IALIAS) {
+                    if (ct->ialias) {
                         set &= op->output_pref[ct->alias_index];
                     }
                     /* If the combination is not possible, restart.  */
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         }
 
         i_preferred_regs = o_preferred_regs = 0;
-        if (arg_ct->ct & TCG_CT_IALIAS) {
+        if (arg_ct->ialias) {
             o_preferred_regs = op->output_pref[arg_ct->alias_index];
             if (ts->fixed_reg) {
                 /* if fixed register, we must allocate a new register
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                     reg = ts->reg;
                     for (k2 = 0 ; k2 < k ; k2++) {
                         i2 = def->args_ct[nb_oargs + k2].sort_index;
-                        if ((def->args_ct[i2].ct & TCG_CT_IALIAS) &&
-                            reg == new_args[i2]) {
+                        if (def->args_ct[i2].ialias && reg == new_args[i2]) {
                             goto allocate_in_reg;
                         }
                     }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             /* ENV should not be modified.  */
             tcg_debug_assert(!ts->fixed_reg);
 
-            if ((arg_ct->ct & TCG_CT_ALIAS)
-                && !const_args[arg_ct->alias_index]) {
+            if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
                 reg = new_args[arg_ct->alias_index];
-            } else if (arg_ct->ct & TCG_CT_NEWREG) {
+            } else if (arg_ct->newreg) {
                 reg = tcg_reg_alloc(s, arg_ct->regs,
                                     i_allocated_regs | o_allocated_regs,
                                     op->output_pref[k], ts->indirect_base);
-- 
2.25.1

The definition of INDEX_op_dupi_vec is that it operates on
units of tcg_target_ulong -- in this case 32 bits.  It does
not work to use this for a uint64_t value that happens to be
small enough to fit in tcg_target_ulong.

Fixes: d2fd745fe8b
Fixes: db432672dc5
Cc: qemu-stable@nongnu.org
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-vec.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
 
 void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 {
-    if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) {
-        do_dupi_vec(r, MO_32, a);
-    } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) {
+    if (TCG_TARGET_REG_BITS == 64) {
         do_dupi_vec(r, MO_64, a);
+    } else if (a == dup_const(MO_32, a)) {
+        do_dupi_vec(r, MO_32, a);
     } else {
         TCGv_i64 c = tcg_const_i64(a);
         tcg_gen_dup_i64_vec(MO_64, r, c);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
 
 void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
 {
-    do_dupi_vec(r, MO_REG, dup_const(vece, a));
+    if (vece == MO_64) {
+        tcg_gen_dup64i_vec(r, a);
+    } else {
+        do_dupi_vec(r, MO_REG, dup_const(vece, a));
+    }
 }
 
 void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
-- 
2.25.1

When the two arguments are identical, this can be reduced to
dup_vec or to mov_vec from a tcg_constant_vec.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             goto do_default;
 
+        case INDEX_op_dup2_vec:
+            assert(TCG_TARGET_REG_BITS == 32);
+            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+                tmp = arg_info(op->args[1])->val;
+                if (tmp == arg_info(op->args[2])->val) {
+                    tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                    break;
+                }
+            } else if (args_are_copies(op->args[1], op->args[2])) {
+                op->opc = INDEX_op_dup_vec;
+                TCGOP_VECE(op) = MO_32;
+                nb_iargs = 1;
+            }
+            goto do_default;
+
         CASE_OP_32_64(not):
         CASE_OP_32_64(neg):
         CASE_OP_32_64(ext8s):
-- 
2.25.1

The cmp_vec opcode is mandatory; this symbol is unused.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.h | 1 -
 tcg/i386/tcg-target.h    | 1 -
 tcg/ppc/tcg-target.h     | 1 -
 3 files changed, 3 deletions(-)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
 #define TCG_TARGET_HAS_shi_vec          1
 #define TCG_TARGET_HAS_shs_vec          0
 #define TCG_TARGET_HAS_shv_vec          1
-#define TCG_TARGET_HAS_cmp_vec          1
 #define TCG_TARGET_HAS_mul_vec          1
 #define TCG_TARGET_HAS_sat_vec          1
 #define TCG_TARGET_HAS_minmax_vec       1
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool have_avx2;
 #define TCG_TARGET_HAS_shi_vec          1
 #define TCG_TARGET_HAS_shs_vec          1
 #define TCG_TARGET_HAS_shv_vec          have_avx2
-#define TCG_TARGET_HAS_cmp_vec          1
 #define TCG_TARGET_HAS_mul_vec          1
 #define TCG_TARGET_HAS_sat_vec          1
 #define TCG_TARGET_HAS_minmax_vec       1
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -XXX,XX +XXX,XX @@ extern bool have_vsx;
 #define TCG_TARGET_HAS_shi_vec          0
 #define TCG_TARGET_HAS_shs_vec          0
 #define TCG_TARGET_HAS_shv_vec          1
-#define TCG_TARGET_HAS_cmp_vec          1
 #define TCG_TARGET_HAS_mul_vec          1
 #define TCG_TARGET_HAS_sat_vec          1
 #define TCG_TARGET_HAS_minmax_vec       1
-- 
2.25.1

Having dupi pass through movi is confusing and arguably wrong.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c                    |  6 +++-
 tcg/aarch64/tcg-target.c.inc |  7 ----
 tcg/i386/tcg-target.c.inc    | 63 ++++++++++++++++++++++++------------
 tcg/ppc/tcg-target.c.inc     |  6 ----
 4 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
     case TEMP_VAL_CONST:
         reg = tcg_reg_alloc(s, desired_regs, allocated_regs,
                             preferred_regs, ts->indirect_base);
-        tcg_out_movi(s, ts->type, reg, ts->val);
+        if (ts->type <= TCG_TYPE_I64) {
+            tcg_out_movi(s, ts->type, reg, ts->val);
+        } else {
+            tcg_out_dupi_vec(s, ts->type, reg, ts->val);
+        }
         ts->mem_coherent = 0;
         break;
     case TEMP_VAL_MEM:
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
     case TCG_TYPE_I64:
         tcg_debug_assert(rd < 32);
         break;
-
-    case TCG_TYPE_V64:
-    case TCG_TYPE_V128:
-        tcg_debug_assert(rd >= 32);
-        tcg_out_dupi_vec(s, type, rd, value);
-        return;
-
     default:
         g_assert_not_reached();
     }
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
     }
 }
 
-static void tcg_out_movi(TCGContext *s, TCGType type,
-                         TCGReg ret, tcg_target_long arg)
+static void tcg_out_movi_vec(TCGContext *s, TCGType type,
+                             TCGReg ret, tcg_target_long arg)
+{
+    if (arg == 0) {
+        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
+        return;
+    }
+    if (arg == -1) {
+        tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
+        return;
+    }
+
+    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
+    tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
+    if (TCG_TARGET_REG_BITS == 64) {
+        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
+    } else {
+        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
+    }
+}
+
+static void tcg_out_movi_int(TCGContext *s, TCGType type,
+                             TCGReg ret, tcg_target_long arg)
 {
     tcg_target_long diff;
 
-    switch (type) {
-    case TCG_TYPE_I32:
-#if TCG_TARGET_REG_BITS == 64
-    case TCG_TYPE_I64:
-#endif
-        if (ret < 16) {
-            break;
-        }
-        /* fallthru */
-    case TCG_TYPE_V64:
-    case TCG_TYPE_V128:
-    case TCG_TYPE_V256:
-        tcg_debug_assert(ret >= 16);
-        tcg_out_dupi_vec(s, type, ret, arg);
-        return;
-    default:
-        g_assert_not_reached();
-    }
-
     if (arg == 0) {
         tgen_arithr(s, ARITH_XOR, ret, ret);
         return;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type,
     tcg_out64(s, arg);
 }
 
+static void tcg_out_movi(TCGContext *s, TCGType type,
+                         TCGReg ret, tcg_target_long arg)
+{
+    switch (type) {
+    case TCG_TYPE_I32:
+#if TCG_TARGET_REG_BITS == 64
+    case TCG_TYPE_I64:
+#endif
+        if (ret < 16) {
+            tcg_out_movi_int(s, type, ret, arg);
+        } else {
+            tcg_out_movi_vec(s, type, ret, arg);
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
 {
     if (val == (int8_t)val) {
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret,
         tcg_out_movi_int(s, type, ret, arg, false);
         break;
 
-    case TCG_TYPE_V64:
-    case TCG_TYPE_V128:
-        tcg_debug_assert(ret >= TCG_REG_V0);
-        tcg_out_dupi_vec(s, type, ret, arg);
-        break;
-
     default:
         g_assert_not_reached();
     }
-- 
2.25.1

While we don't store more than tcg_target_long in TCGTemp,
we shouldn't be limited to that for code generation.  We will
be able to use this for INDEX_op_dup2_vec with 2 constants.

Also pass along the minimal vece that may be said to apply
to the constant.  This allows some simplification in the
various backends.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c                    | 31 +++++++++++++++++++++++++-----
 tcg/aarch64/tcg-target.c.inc | 12 ++++++------
 tcg/i386/tcg-target.c.inc    | 22 ++++++++++++---------
 tcg/ppc/tcg-target.c.inc     | 37 +++++++++++++++++++++++-------------
 4 files changed, 69 insertions(+), 33 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg dst, TCGReg src);
 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                              TCGReg dst, TCGReg base, intptr_t offset);
-static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
-                             TCGReg dst, tcg_target_long arg);
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg dst, int64_t arg);
 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
                            unsigned vece, const TCGArg *args,
                            const int *const_args);
@@ -XXX,XX +XXX,XX @@ static inline bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 {
     g_assert_not_reached();
 }
-static inline void tcg_out_dupi_vec(TCGContext *s, TCGType type,
-                                    TCGReg dst, tcg_target_long arg)
+static inline void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                                    TCGReg dst, int64_t arg)
 {
     g_assert_not_reached();
 }
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
         if (ts->type <= TCG_TYPE_I64) {
             tcg_out_movi(s, ts->type, reg, ts->val);
         } else {
-            tcg_out_dupi_vec(s, ts->type, reg, ts->val);
+            uint64_t val = ts->val;
+            MemOp vece = MO_64;
+
+            /*
+             * Find the minimal vector element that matches the constant.
+             * The targets will, in general, have to do this search anyway,
+             * so do this generically.
+             */
+            if (TCG_TARGET_REG_BITS == 32) {
+                val = dup_const(MO_32, val);
+                vece = MO_32;
+            }
+            if (val == dup_const(MO_8, val)) {
+                vece = MO_8;
+            } else if (val == dup_const(MO_16, val)) {
+                vece = MO_16;
+            } else if (TCG_TARGET_REG_BITS == 64 &&
+                       val == dup_const(MO_32, val)) {
+                vece = MO_32;
+            }
+
+            tcg_out_dupi_vec(s, ts->type, vece, reg, ts->val);
         }
         ts->mem_coherent = 0;
         break;
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
     tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c);
 }
 
-static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
-                             TCGReg rd, tcg_target_long v64)
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg rd, int64_t v64)
 {
     bool q = type == TCG_TYPE_V128;
     int cmode, imm8, i;
 
     /* Test all bytes equal first.  */
-    if (v64 == dup_const(MO_8, v64)) {
+    if (vece == MO_8) {
         imm8 = (uint8_t)v64;
         tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0xe, imm8);
         return;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
      * cannot find an expansion there's no point checking a larger
      * width because we already know by replication it cannot match.
      */
-    if (v64 == dup_const(MO_16, v64)) {
+    if (vece == MO_16) {
         uint16_t v16 = v64;
 
         if (is_shimm16(v16, &cmode, &imm8)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
         tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0x8, v16 & 0xff);
         tcg_out_insn(s, 3606, ORR, q, rd, 0, 0xa, v16 >> 8);
         return;
-    } else if (v64 == dup_const(MO_32, v64)) {
+    } else if (vece == MO_32) {
         uint32_t v32 = v64;
         uint32_t n32 = ~v32;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                         tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
                         break;
                     }
-                    tcg_out_dupi_vec(s, type, TCG_VEC_TMP, 0);
+                    tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
                     a2 = TCG_VEC_TMP;
                 }
                 insn = cmp_insn[cond];
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
     return true;
 }
 
-static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
-                             TCGReg ret, tcg_target_long arg)
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg ret, int64_t arg)
 {
     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
         return;
     }
 
-    if (TCG_TARGET_REG_BITS == 64) {
+    if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
+        if (have_avx2) {
+            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
+        } else {
+            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
+        }
+        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
+    } else {
         if (type == TCG_TYPE_V64) {
             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
         } else if (have_avx2) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
         } else {
             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
         }
-        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
-    } else {
-        if (have_avx2) {
-            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
+        if (TCG_TARGET_REG_BITS == 64) {
+            new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
         } else {
-            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
+            new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
         }
-        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
     }
 }
 
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
     }
 }
 
-static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
-                             tcg_target_long val)
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg ret, int64_t val)
 {
     uint32_t load_insn;
     int rel, low;
     intptr_t add;
 
-    low = (int8_t)val;
-    if (low >= -16 && low < 16) {
-        if (val == (tcg_target_long)dup_const(MO_8, low)) {
+    switch (vece) {
+    case MO_8:
+        low = (int8_t)val;
+        if (low >= -16 && low < 16) {
             tcg_out32(s, VSPLTISB | VRT(ret) | ((val & 31) << 16));
             return;
         }
-        if (val == (tcg_target_long)dup_const(MO_16, low)) {
+        if (have_isa_3_00) {
+            tcg_out32(s, XXSPLTIB | VRT(ret) | ((val & 0xff) << 11));
+            return;
+        }
+        break;
+
+    case MO_16:
+        low = (int16_t)val;
+        if (low >= -16 && low < 16) {
             tcg_out32(s, VSPLTISH | VRT(ret) | ((val & 31) << 16));
             return;
         }
-        if (val == (tcg_target_long)dup_const(MO_32, low)) {
+        break;
+
+    case MO_32:
+        low = (int32_t)val;
+        if (low >= -16 && low < 16) {
             tcg_out32(s, VSPLTISW | VRT(ret) | ((val & 31) << 16));
             return;
         }
-    }
-    if (have_isa_3_00 && val == (tcg_target_long)dup_const(MO_8, val)) {
-        tcg_out32(s, XXSPLTIB | VRT(ret) | ((val & 0xff) << 11));
-        return;
+        break;
     }
 
     /*
@@ -XXX,XX +XXX,XX @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
         if (TCG_TARGET_REG_BITS == 64) {
             new_pool_label(s, val, rel, s->code_ptr, add);
         } else {
-            new_pool_l2(s, rel, s->code_ptr, add, val, val);
+            new_pool_l2(s, rel, s->code_ptr, add, val >> 32, val);
         }
     } else {
         load_insn = LVX | VRT(ret) | RB(TCG_REG_TMP1);
         if (TCG_TARGET_REG_BITS == 64) {
             new_pool_l2(s, rel, s->code_ptr, add, val, val);
         } else {
-            new_pool_l4(s, rel, s->code_ptr, add, val, val, val, val);
+            new_pool_l4(s, rel, s->code_ptr, add,
+                        val >> 32, val, val >> 32, val);
         }
     }
 
-- 
2.25.1

The temp_fixed, temp_global, temp_local bits are all related.
Combine them into a single enumeration.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h |  20 +++++---
 tcg/optimize.c    |   8 +--
 tcg/tcg.c         | 122 ++++++++++++++++++++++++++++------------------
 3 files changed, 90 insertions(+), 60 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef enum TCGTempVal {
     TEMP_VAL_CONST,
 } TCGTempVal;
 
+typedef enum TCGTempKind {
+    /* Temp is dead at the end of all basic blocks. */
+    TEMP_NORMAL,
+    /* Temp is saved across basic blocks but dead at the end of TBs. */
+    TEMP_LOCAL,
+    /* Temp is saved across both basic blocks and translation blocks. */
+    TEMP_GLOBAL,
+    /* Temp is in a fixed register. */
+    TEMP_FIXED,
+} TCGTempKind;
+
 typedef struct TCGTemp {
     TCGReg reg:8;
     TCGTempVal val_type:8;
     TCGType base_type:8;
     TCGType type:8;
-    unsigned int fixed_reg:1;
+    TCGTempKind kind:3;
     unsigned int indirect_reg:1;
     unsigned int indirect_base:1;
     unsigned int mem_coherent:1;
     unsigned int mem_allocated:1;
-    /* If true, the temp is saved across both basic blocks and
-       translation blocks.  */
-    unsigned int temp_global:1;
-    /* If true, the temp is saved across basic blocks but dead
-       at the end of translation blocks.  If false, the temp is
-       dead at the end of basic blocks.  */
-    unsigned int temp_local:1;
     unsigned int temp_allocated:1;
 
     tcg_target_long val;
diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
     TCGTemp *i;
 
     /* If this is already a global, we can't do better. */
-    if (ts->temp_global) {
+    if (ts->kind >= TEMP_GLOBAL) {
         return ts;
     }
 
     /* Search for a global first. */
     for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-        if (i->temp_global) {
+        if (i->kind >= TEMP_GLOBAL) {
             return i;
         }
     }
 
     /* If it is a temp, search for a temp local. */
-    if (!ts->temp_local) {
+    if (ts->kind == TEMP_NORMAL) {
         for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-            if (ts->temp_local) {
+            if (i->kind >= TEMP_LOCAL) {
                 return i;
             }
         }
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static inline TCGTemp *tcg_global_alloc(TCGContext *s)
     tcg_debug_assert(s->nb_globals == s->nb_temps);
     s->nb_globals++;
     ts = tcg_temp_alloc(s);
-    ts->temp_global = 1;
+    ts->kind = TEMP_GLOBAL;
 
     return ts;
 }
@@ -XXX,XX +XXX,XX @@ static TCGTemp *tcg_global_reg_new_internal(TCGContext *s, TCGType type,
     ts = tcg_global_alloc(s);
     ts->base_type = type;
     ts->type = type;
-    ts->fixed_reg = 1;
+    ts->kind = TEMP_FIXED;
     ts->reg = reg;
     ts->name = name;
     tcg_regset_set_reg(s->reserved_regs, reg);
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
     bigendian = 1;
 #endif
 
-    if (!base_ts->fixed_reg) {
+    if (base_ts->kind != TEMP_FIXED) {
         /* We do not support double-indirect registers.  */
         tcg_debug_assert(!base_ts->indirect_reg);
         base_ts->indirect_base = 1;
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
 TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
 {
     TCGContext *s = tcg_ctx;
+    TCGTempKind kind = temp_local ? TEMP_LOCAL : TEMP_NORMAL;
     TCGTemp *ts;
     int idx, k;
 
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
         ts = &s->temps[idx];
         ts->temp_allocated = 1;
         tcg_debug_assert(ts->base_type == type);
-        tcg_debug_assert(ts->temp_local == temp_local);
+        tcg_debug_assert(ts->kind == kind);
     } else {
         ts = tcg_temp_alloc(s);
         if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_temp_new_internal(TCGType type, bool temp_local)
             ts->base_type = type;
             ts->type = TCG_TYPE_I32;
             ts->temp_allocated = 1;
-            ts->temp_local = temp_local;
+            ts->kind = kind;
 
             tcg_debug_assert(ts2 == ts + 1);
             ts2->base_type = TCG_TYPE_I64;
             ts2->type = TCG_TYPE_I32;
             ts2->temp_allocated = 1;
-            ts2->temp_local = temp_local;
+            ts2->kind = kind;
         } else {
             ts->base_type = type;
             ts->type = type;
             ts->temp_allocated = 1;
-            ts->temp_local = temp_local;
+            ts->kind = kind;
         }
     }
 
@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
     }
 #endif
 
-    tcg_debug_assert(ts->temp_global == 0);
+    tcg_debug_assert(ts->kind < TEMP_GLOBAL);
     tcg_debug_assert(ts->temp_allocated != 0);
     ts->temp_allocated = 0;
 
     idx = temp_idx(ts);
-    k = ts->base_type + (ts->temp_local ? TCG_TYPE_COUNT : 0);
+    k = ts->base_type + (ts->kind == TEMP_NORMAL ? 0 : TCG_TYPE_COUNT);
     set_bit(idx, s->free_temps[k].l);
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
 static void tcg_reg_alloc_start(TCGContext *s)
 {
     int i, n;
-    TCGTemp *ts;
 
-    for (i = 0, n = s->nb_globals; i < n; i++) {
-        ts = &s->temps[i];
-        ts->val_type = (ts->fixed_reg ? TEMP_VAL_REG : TEMP_VAL_MEM);
-    }
-    for (n = s->nb_temps; i < n; i++) {
-        ts = &s->temps[i];
-        ts->val_type = (ts->temp_local ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
-        ts->mem_allocated = 0;
-        ts->fixed_reg = 0;
+    for (i = 0, n = s->nb_temps; i < n; i++) {
+        TCGTemp *ts = &s->temps[i];
+        TCGTempVal val = TEMP_VAL_MEM;
+
+        switch (ts->kind) {
+        case TEMP_FIXED:
+            val = TEMP_VAL_REG;
+            break;
+        case TEMP_GLOBAL:
+            break;
+        case TEMP_NORMAL:
+            val = TEMP_VAL_DEAD;
+            /* fall through */
+        case TEMP_LOCAL:
+            ts->mem_allocated = 0;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        ts->val_type = val;
     }
 
     memset(s->reg_to_temp, 0, sizeof(s->reg_to_temp));
@@ -XXX,XX +XXX,XX @@ static char *tcg_get_arg_str_ptr(TCGContext *s, char *buf, int buf_size,
 {
     int idx = temp_idx(ts);
 
-    if (ts->temp_global) {
+    switch (ts->kind) {
+    case TEMP_FIXED:
+    case TEMP_GLOBAL:
         pstrcpy(buf, buf_size, ts->name);
-    } else if (ts->temp_local) {
+        break;
+    case TEMP_LOCAL:
         snprintf(buf, buf_size, "loc%d", idx - s->nb_globals);
-    } else {
+        break;
+    case TEMP_NORMAL:
         snprintf(buf, buf_size, "tmp%d", idx - s->nb_globals);
+        break;
     }
     return buf;
 }
@@ -XXX,XX +XXX,XX @@ static void la_bb_end(TCGContext *s, int ng, int nt)
 {
     int i;
 
-    for (i = 0; i < ng; ++i) {
-        s->temps[i].state = TS_DEAD | TS_MEM;
-        la_reset_pref(&s->temps[i]);
-    }
-    for (i = ng; i < nt; ++i) {
-        s->temps[i].state = (s->temps[i].temp_local
-                             ? TS_DEAD | TS_MEM
-                             : TS_DEAD);
-        la_reset_pref(&s->temps[i]);
+    for (i = 0; i < nt; ++i) {
+        TCGTemp *ts = &s->temps[i];
+        int state;
+
+        switch (ts->kind) {
+        case TEMP_FIXED:
+        case TEMP_GLOBAL:
+        case TEMP_LOCAL:
+            state = TS_DEAD | TS_MEM;
+            break;
+        case TEMP_NORMAL:
+            state = TS_DEAD;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        ts->state = state;
+        la_reset_pref(ts);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void check_regs(TCGContext *s)
     }
     for (k = 0; k < s->nb_temps; k++) {
         ts = &s->temps[k];
-        if (ts->val_type == TEMP_VAL_REG && !ts->fixed_reg
+        if (ts->val_type == TEMP_VAL_REG
+            && ts->kind != TEMP_FIXED
             && s->reg_to_temp[ts->reg] != ts) {
             printf("Inconsistency for temp %s:\n",
                    tcg_get_arg_str_ptr(s, buf, sizeof(buf), ts));
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
    mark it free; otherwise mark it dead.  */
 static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
 {
-    if (ts->fixed_reg) {
+    if (ts->kind == TEMP_FIXED) {
         return;
     }
     if (ts->val_type == TEMP_VAL_REG) {
         s->reg_to_temp[ts->reg] = NULL;
     }
     ts->val_type = (free_or_dead < 0
-                    || ts->temp_local
-                    || ts->temp_global
+                    || ts->kind != TEMP_NORMAL
                     ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
 }
 
@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
 static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
                       TCGRegSet preferred_regs, int free_or_dead)
 {
-    if (ts->fixed_reg) {
+    if (ts->kind == TEMP_FIXED) {
         return;
     }
     if (!ts->mem_coherent) {
@@ -XXX,XX +XXX,XX @@ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs)
 {
     /* The liveness analysis already ensures that globals are back
        in memory. Keep an tcg_debug_assert for safety. */
-    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg);
+    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM
+                     || ts->kind == TEMP_FIXED);
 }
 
 /* save globals to their canonical location and assume they can be
@@ -XXX,XX +XXX,XX @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs)
     for (i = 0, n = s->nb_globals; i < n; i++) {
         TCGTemp *ts = &s->temps[i];
         tcg_debug_assert(ts->val_type != TEMP_VAL_REG
-                         || ts->fixed_reg
+                         || ts->kind == TEMP_FIXED
                          || ts->mem_coherent);
     }
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
 
     for (i = s->nb_globals; i < s->nb_temps; i++) {
         TCGTemp *ts = &s->temps[i];
-        if (ts->temp_local) {
+        if (ts->kind == TEMP_LOCAL) {
             temp_save(s, ts, allocated_regs);
         } else {
             /* The liveness analysis already ensures that temps are dead.
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
                                   TCGRegSet preferred_regs)
 {
     /* ENV should not be modified.  */
-    tcg_debug_assert(!ots->fixed_reg);
+    tcg_debug_assert(ots->kind != TEMP_FIXED);
 
     /* The movi is not explicitly generated here.  */
     if (ots->val_type == TEMP_VAL_REG) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
     ts = arg_temp(op->args[1]);
 
     /* ENV should not be modified.  */
-    tcg_debug_assert(!ots->fixed_reg);
+    tcg_debug_assert(ots->kind != TEMP_FIXED);
 
     /* Note that otype != itype for no-op truncation.  */
     otype = ots->type;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
         }
         temp_dead(s, ots);
     } else {
-        if (IS_DEAD_ARG(1) && !ts->fixed_reg) {
+        if (IS_DEAD_ARG(1) && ts->kind != TEMP_FIXED) {
             /* the mov can be suppressed */
             if (ots->val_type == TEMP_VAL_REG) {
                 s->reg_to_temp[ots->reg] = NULL;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
                  * Store the source register into the destination slot
                  * and leave the destination temp as TEMP_VAL_MEM.
                  */
-                assert(!ots->fixed_reg);
+                assert(ots->kind != TEMP_FIXED);
                 if (!ts->mem_allocated) {
                     temp_allocate_frame(s, ots);
                 }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
     its = arg_temp(op->args[1]);
 
     /* ENV should not be modified.  */
-    tcg_debug_assert(!ots->fixed_reg);
+    tcg_debug_assert(ots->kind != TEMP_FIXED);
 
     itype = its->type;
     vece = TCGOP_VECE(op);
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         i_preferred_regs = o_preferred_regs = 0;
         if (arg_ct->ialias) {
             o_preferred_regs = op->output_pref[arg_ct->alias_index];
-            if (ts->fixed_reg) {
+            if (ts->kind == TEMP_FIXED) {
                 /* if fixed register, we must allocate a new register
                    if the alias is not the same register */
                 if (arg != op->args[arg_ct->alias_index]) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             ts = arg_temp(arg);
 
             /* ENV should not be modified.  */
-            tcg_debug_assert(!ts->fixed_reg);
+            tcg_debug_assert(ts->kind != TEMP_FIXED);
 
             if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
                 reg = new_args[arg_ct->alias_index];
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         ts = arg_temp(op->args[i]);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(!ts->fixed_reg);
+        tcg_debug_assert(ts->kind != TEMP_FIXED);
 
         if (NEED_SYNC_ARG(i)) {
             temp_sync(s, ts, o_allocated_regs, 0, IS_DEAD_ARG(i));
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
         ts = arg_temp(arg);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(!ts->fixed_reg);
+        tcg_debug_assert(ts->kind != TEMP_FIXED);
 
         reg = tcg_target_call_oarg_regs[i];
         tcg_debug_assert(s->reg_to_temp[reg] == NULL);
-- 
2.25.1

In most, but not all, places that we check for TEMP_FIXED,
we are really testing that we do not modify the temporary.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h |  5 +++++
 tcg/tcg.c         | 21 ++++++++++-----------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
     target_ulong gen_insn_data[TCG_MAX_INSNS][TARGET_INSN_START_WORDS];
 };
 
+static inline bool temp_readonly(TCGTemp *ts)
+{
+    return ts->kind == TEMP_FIXED;
+}
+
 extern TCGContext tcg_init_ctx;
 extern __thread TCGContext *tcg_ctx;
 extern TCGv_env cpu_env;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
    mark it free; otherwise mark it dead.  */
 static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
 {
-    if (ts->kind == TEMP_FIXED) {
+    if (temp_readonly(ts)) {
         return;
     }
     if (ts->val_type == TEMP_VAL_REG) {
@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
 static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
                       TCGRegSet preferred_regs, int free_or_dead)
 {
-    if (ts->kind == TEMP_FIXED) {
+    if (temp_readonly(ts)) {
         return;
     }
     if (!ts->mem_coherent) {
@@ -XXX,XX +XXX,XX @@ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs)
 {
     /* The liveness analysis already ensures that globals are back
        in memory. Keep an tcg_debug_assert for safety. */
-    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM
-                     || ts->kind == TEMP_FIXED);
+    tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || temp_readonly(ts));
 }
 
 /* save globals to their canonical location and assume they can be
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
                                   TCGRegSet preferred_regs)
 {
     /* ENV should not be modified.  */
-    tcg_debug_assert(ots->kind != TEMP_FIXED);
+    tcg_debug_assert(!temp_readonly(ots));
 
     /* The movi is not explicitly generated here.  */
     if (ots->val_type == TEMP_VAL_REG) {
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
     ts = arg_temp(op->args[1]);
 
     /* ENV should not be modified.  */
-    tcg_debug_assert(ots->kind != TEMP_FIXED);
+    tcg_debug_assert(!temp_readonly(ots));
 
     /* Note that otype != itype for no-op truncation.  */
     otype = ots->type;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
                  * Store the source register into the destination slot
                  * and leave the destination temp as TEMP_VAL_MEM.
                  */
-                assert(ots->kind != TEMP_FIXED);
+                assert(!temp_readonly(ots));
                 if (!ts->mem_allocated) {
                     temp_allocate_frame(s, ots);
                 }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
     its = arg_temp(op->args[1]);
 
     /* ENV should not be modified.  */
-    tcg_debug_assert(ots->kind != TEMP_FIXED);
+    tcg_debug_assert(!temp_readonly(ots));
 
     itype = its->type;
     vece = TCGOP_VECE(op);
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             ts = arg_temp(arg);
 
             /* ENV should not be modified.  */
-            tcg_debug_assert(ts->kind != TEMP_FIXED);
+            tcg_debug_assert(!temp_readonly(ts));
 
             if (arg_ct->oalias && !const_args[arg_ct->alias_index]) {
                 reg = new_args[arg_ct->alias_index];
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         ts = arg_temp(op->args[i]);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(ts->kind != TEMP_FIXED);
+        tcg_debug_assert(!temp_readonly(ts));
 
         if (NEED_SYNC_ARG(i)) {
             temp_sync(s, ts, o_allocated_regs, 0, IS_DEAD_ARG(i));
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
         ts = arg_temp(arg);
 
         /* ENV should not be modified.  */
-        tcg_debug_assert(ts->kind != TEMP_FIXED);
+        tcg_debug_assert(!temp_readonly(ts));
 
         reg = tcg_target_call_oarg_regs[i];
         tcg_debug_assert(s->reg_to_temp[reg] == NULL);
-- 
2.25.1

This will reduce the differences between 32-bit and 64-bit hosts,
allowing full 64-bit constants to be created with the same interface.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h | 2 +-
 tcg/tcg.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef struct TCGTemp {
     unsigned int mem_allocated:1;
     unsigned int temp_allocated:1;
 
-    tcg_target_long val;
+    int64_t val;
     struct TCGTemp *mem_base;
     intptr_t mem_offset;
     const char *name;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void dump_regs(TCGContext *s)
                    tcg_target_reg_names[ts->mem_base->reg]);
             break;
         case TEMP_VAL_CONST:
-            printf("$0x%" TCG_PRIlx, ts->val);
+            printf("$0x%" PRIx64, ts->val);
             break;
         case TEMP_VAL_DEAD:
             printf("D");
-- 
2.25.1

Rename struct tcg_temp_info to TempOptInfo to match our coding style.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
         glue(glue(case INDEX_op_, x), _i64):    \
         glue(glue(case INDEX_op_, x), _vec)
 
-struct tcg_temp_info {
+typedef struct TempOptInfo {
     bool is_const;
     TCGTemp *prev_copy;
     TCGTemp *next_copy;
     tcg_target_ulong val;
     tcg_target_ulong mask;
-};
+} TempOptInfo;
 
-static inline struct tcg_temp_info *ts_info(TCGTemp *ts)
+static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
 }
 
-static inline struct tcg_temp_info *arg_info(TCGArg arg)
+static inline TempOptInfo *arg_info(TCGArg arg)
 {
     return ts_info(arg_temp(arg));
 }
@@ -XXX,XX +XXX,XX @@ static inline bool ts_is_copy(TCGTemp *ts)
 /* Reset TEMP's state, possibly removing the temp for the list of copies.  */
 static void reset_ts(TCGTemp *ts)
 {
-    struct tcg_temp_info *ti = ts_info(ts);
-    struct tcg_temp_info *pi = ts_info(ti->prev_copy);
-    struct tcg_temp_info *ni = ts_info(ti->next_copy);
+    TempOptInfo *ti = ts_info(ts);
+    TempOptInfo *pi = ts_info(ti->prev_copy);
+    TempOptInfo *ni = ts_info(ti->next_copy);
 
     ni->prev_copy = ti->prev_copy;
     pi->next_copy = ti->next_copy;
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(struct tcg_temp_info *infos,
+static void init_ts_info(TempOptInfo *infos,
                          TCGTempSet *temps_used, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
     if (!test_bit(idx, temps_used->l)) {
-        struct tcg_temp_info *ti = &infos[idx];
+        TempOptInfo *ti = &infos[idx];
 
         ts->state_ptr = ti;
         ti->next_copy = ts;
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(struct tcg_temp_info *infos,
     }
 }
 
-static void init_arg_info(struct tcg_temp_info *infos,
+static void init_arg_info(TempOptInfo *infos,
                           TCGTempSet *temps_used, TCGArg arg)
 {
     init_ts_info(infos, temps_used, arg_temp(arg));
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val)
     const TCGOpDef *def;
     TCGOpcode new_op;
     tcg_target_ulong mask;
-    struct tcg_temp_info *di = arg_info(dst);
+    TempOptInfo *di = arg_info(dst);
 
     def = &tcg_op_defs[op->opc];
     if (def->flags & TCG_OPF_VECTOR) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
     const TCGOpDef *def;
-    struct tcg_temp_info *di;
-    struct tcg_temp_info *si;
+    TempOptInfo *di;
+    TempOptInfo *si;
     tcg_target_ulong mask;
     TCGOpcode new_op;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     di->mask = mask;
 
     if (src_ts->type == dst_ts->type) {
-        struct tcg_temp_info *ni = ts_info(si->next_copy);
+        TempOptInfo *ni = ts_info(si->next_copy);
 
         di->next_copy = si->next_copy;
         di->prev_copy = src_ts;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    struct tcg_temp_info *infos;
+    TempOptInfo *infos;
     TCGTempSet temps_used;
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
     bitmap_zero(temps_used.l, nb_temps);
-    infos = tcg_malloc(sizeof(struct tcg_temp_info) * nb_temps);
+    infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         tcg_target_ulong mask, partmask, affected;
-- 
2.25.1

This propagates the earlier widening of TCGTemp.val to 64 bits.
In addition, it will be required for vector constants.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     bool is_const;
     TCGTemp *prev_copy;
     TCGTemp *next_copy;
-    tcg_target_ulong val;
-    tcg_target_ulong mask;
+    uint64_t val;
+    uint64_t mask;
 } TempOptInfo;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val)
+static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def;
     TCGOpcode new_op;
-    tcg_target_ulong mask;
+    uint64_t mask;
     TempOptInfo *di = arg_info(dst);
 
     def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
-    tcg_target_ulong mask;
+    uint64_t mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
+static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
 {
     uint64_t l64, h64;
 
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
     }
 }
 
-static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
+static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
 {
     const TCGOpDef *def = &tcg_op_defs[op];
-    TCGArg res = do_constant_folding_2(op, x, y);
+    uint64_t res = do_constant_folding_2(op, x, y);
     if (!(def->flags & TCG_OPF_64BIT)) {
         res = (int32_t)res;
     }
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
 static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
                                        TCGArg y, TCGCond c)
 {
-    tcg_target_ulong xv = arg_info(x)->val;
-    tcg_target_ulong yv = arg_info(y)->val;
+    uint64_t xv = arg_info(x)->val;
+    uint64_t yv = arg_info(y)->val;
+
     if (arg_is_const(x) && arg_is_const(y)) {
         const TCGOpDef *def = &tcg_op_defs[op];
         tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        tcg_target_ulong mask, partmask, affected;
+        uint64_t mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs, i;
-        TCGArg tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         CASE_OP_32_64(extract2):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                TCGArg v1 = arg_info(op->args[1])->val;
-                TCGArg v2 = arg_info(op->args[2])->val;
+                uint64_t v1 = arg_info(op->args[1])->val;
+                uint64_t v2 = arg_info(op->args[2])->val;
+                int shr = op->args[3];
 
                 if (opc == INDEX_op_extract2_i64) {
-                    tmp = (v1 >> op->args[3]) | (v2 << (64 - op->args[3]));
+                    tmp = (v1 >> shr) | (v2 << (64 - shr));
                 } else {
-                    tmp = (int32_t)(((uint32_t)v1 >> op->args[3]) |
-                                    ((uint32_t)v2 << (32 - op->args[3])));
+                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
+                                    ((uint32_t)v2 << (32 - shr)));
                 }
                 tcg_opt_gen_movi(s, op, op->args[0], tmp);
                 break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 break;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-                tcg_target_ulong tv = arg_info(op->args[3])->val;
-                tcg_target_ulong fv = arg_info(op->args[4])->val;
+                uint64_t tv = arg_info(op->args[3])->val;
+                uint64_t fv = arg_info(op->args[4])->val;
                 TCGCond cond = op->args[5];
+
                 if (fv == 1 && tv == 0) {
                     cond = tcg_invert_cond(cond);
                 } else if (!(tv == 1 && fv == 0)) {
-- 
2.25.1

TEMP_CONST temporaries will hold a single constant for the duration of the TB.
They are hashed, so that each value of a given type has one temp across the TB.

Not used yet; this is all infrastructure.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h |  24 +++++-
 tcg/optimize.c    |  13 +++-
 tcg/tcg.c         | 195 ++++++++++++++++++++++++++++++++++++----------
 3 files changed, 188 insertions(+), 44 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ typedef enum TCGTempKind {
     TEMP_GLOBAL,
     /* Temp is in a fixed register. */
     TEMP_FIXED,
+    /* Temp is a fixed constant. */
+    TEMP_CONST,
 } TCGTempKind;
 
 typedef struct TCGTemp {
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
     QSIMPLEQ_HEAD(, TCGOp) plugin_ops;
 #endif
 
+    GHashTable *const_table[TCG_TYPE_COUNT];
     TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
     TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
 
@@ -XXX,XX +XXX,XX @@ struct TCGContext {
 
 static inline bool temp_readonly(TCGTemp *ts)
 {
-    return ts->kind == TEMP_FIXED;
+    return ts->kind >= TEMP_FIXED;
 }
 
 extern TCGContext tcg_init_ctx;
@@ -XXX,XX +XXX,XX @@ TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc);
 
 void tcg_optimize(TCGContext *s);
 
+/* Allocate a new temporary and initialize it with a constant. */
 TCGv_i32 tcg_const_i32(int32_t val);
 TCGv_i64 tcg_const_i64(int64_t val);
 TCGv_i32 tcg_const_local_i32(int32_t val);
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec(TCGType);
 TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec);
 TCGv_vec tcg_const_ones_vec_matching(TCGv_vec);
 
+/*
+ * Locate or create a read-only temporary that is a constant.
+ * This kind of temporary need not and should not be freed.
+ */
+TCGTemp *tcg_constant_internal(TCGType type, int64_t val);
+
+static inline TCGv_i32 tcg_constant_i32(int32_t val)
+{
+    return temp_tcgv_i32(tcg_constant_internal(TCG_TYPE_I32, val));
+}
+
+static inline TCGv_i64 tcg_constant_i64(int64_t val)
+{
+    return temp_tcgv_i64(tcg_constant_internal(TCG_TYPE_I64, val));
+}
+
+TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val);
+
 #if UINTPTR_MAX == UINT32_MAX
 # define tcg_const_ptr(x)        ((TCGv_ptr)tcg_const_i32((intptr_t)(x)))
 # define tcg_const_local_ptr(x)  ((TCGv_ptr)tcg_const_local_i32((intptr_t)(x)))
diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TempOptInfo *infos,
         ts->state_ptr = ti;
         ti->next_copy = ts;
         ti->prev_copy = ts;
-        ti->is_const = false;
-        ti->mask = -1;
+        if (ts->kind == TEMP_CONST) {
+            ti->is_const = true;
+            ti->val = ti->mask = ts->val;
+            if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
+                /* High bits of a 32-bit quantity are garbage.  */
+                ti->mask |= ~0xffffffffull;
+            }
+        } else {
+            ti->is_const = false;
+            ti->mask = -1;
+        }
         set_bit(idx, temps_used->l);
     }
 }
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ TCGTemp *tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
     bigendian = 1;
 #endif
 
-    if (base_ts->kind != TEMP_FIXED) {
+    switch (base_ts->kind) {
+    case TEMP_FIXED:
+        break;
+    case TEMP_GLOBAL:
         /* We do not support double-indirect registers.  */
         tcg_debug_assert(!base_ts->indirect_reg);
         base_ts->indirect_base = 1;
         s->nb_indirects += (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64
                             ? 2 : 1);
         indirect_reg = 1;
+        break;
+    default:
+        g_assert_not_reached();
     }
 
     if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
     TCGContext *s = tcg_ctx;
     int k, idx;
 
+    /* In order to simplify users of tcg_constant_*, silently ignore free. */
+    if (ts->kind == TEMP_CONST) {
+        return;
+    }
+
 #if defined(CONFIG_DEBUG_TCG)
     s->temps_in_use--;
     if (s->temps_in_use < 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_temp_free_internal(TCGTemp *ts)
     set_bit(idx, s->free_temps[k].l);
 }
 
+TCGTemp *tcg_constant_internal(TCGType type, int64_t val)
+{
+    TCGContext *s = tcg_ctx;
+    GHashTable *h = s->const_table[type];
+    TCGTemp *ts;
+
+    if (h == NULL) {
+        h = g_hash_table_new(g_int64_hash, g_int64_equal);
+        s->const_table[type] = h;
+    }
+
+    ts = g_hash_table_lookup(h, &val);
+    if (ts == NULL) {
+        ts = tcg_temp_alloc(s);
+
+        if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
+            TCGTemp *ts2 = tcg_temp_alloc(s);
+
+            ts->base_type = TCG_TYPE_I64;
+            ts->type = TCG_TYPE_I32;
+            ts->kind = TEMP_CONST;
+            ts->temp_allocated = 1;
+            /*
+             * Retain the full value of the 64-bit constant in the low
+             * part, so that the hash table works.  Actual uses will
+             * truncate the value to the low part.
+             */
+            ts->val = val;
+
+            tcg_debug_assert(ts2 == ts + 1);
+            ts2->base_type = TCG_TYPE_I64;
+            ts2->type = TCG_TYPE_I32;
+            ts2->kind = TEMP_CONST;
+            ts2->temp_allocated = 1;
+            ts2->val = val >> 32;
+        } else {
+            ts->base_type = type;
+            ts->type = type;
+            ts->kind = TEMP_CONST;
+            ts->temp_allocated = 1;
+            ts->val = val;
+        }
+        g_hash_table_insert(h, &ts->val, ts);
+    }
+
+    return ts;
+}
+
+TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val)
+{
+    val = dup_const(vece, val);
+    return temp_tcgv_vec(tcg_constant_internal(type, val));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
     TCGv_i32 t0;
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_start(TCGContext *s)
         TCGTempVal val = TEMP_VAL_MEM;
 
         switch (ts->kind) {
+        case TEMP_CONST:
+            val = TEMP_VAL_CONST;
+            break;
         case TEMP_FIXED:
             val = TEMP_VAL_REG;
             break;
@@ -XXX,XX +XXX,XX @@ static char *tcg_get_arg_str_ptr(TCGContext *s, char *buf, int buf_size,
     case TEMP_NORMAL:
         snprintf(buf, buf_size, "tmp%d", idx - s->nb_globals);
         break;
+    case TEMP_CONST:
+        switch (ts->type) {
+        case TCG_TYPE_I32:
+            snprintf(buf, buf_size, "$0x%x", (int32_t)ts->val);
+            break;
+#if TCG_TARGET_REG_BITS > 32
+        case TCG_TYPE_I64:
+            snprintf(buf, buf_size, "$0x%" PRIx64, ts->val);
+            break;
+#endif
+        case TCG_TYPE_V64:
+        case TCG_TYPE_V128:
+        case TCG_TYPE_V256:
+            snprintf(buf, buf_size, "v%d$0x%" PRIx64,
+                     64 << (ts->type - TCG_TYPE_V64), ts->val);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        break;
     }
     return buf;
 }
@@ -XXX,XX +XXX,XX @@ static void la_bb_end(TCGContext *s, int ng, int nt)
             state = TS_DEAD | TS_MEM;
             break;
         case TEMP_NORMAL:
+        case TEMP_CONST:
             state = TS_DEAD;
             break;
         default:
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
    mark it free; otherwise mark it dead.  */
 static void temp_free_or_dead(TCGContext *s, TCGTemp *ts, int free_or_dead)
 {
-    if (temp_readonly(ts)) {
+    TCGTempVal new_type;
+
+    switch (ts->kind) {
+    case TEMP_FIXED:
         return;
+    case TEMP_GLOBAL:
+    case TEMP_LOCAL:
+        new_type = TEMP_VAL_MEM;
+        break;
+    case TEMP_NORMAL:
+        new_type = free_or_dead < 0 ? TEMP_VAL_MEM : TEMP_VAL_DEAD;
+        break;
+    case TEMP_CONST:
+        new_type = TEMP_VAL_CONST;
+        break;
+    default:
+        g_assert_not_reached();
     }
     if (ts->val_type == TEMP_VAL_REG) {
         s->reg_to_temp[ts->reg] = NULL;
     }
-    ts->val_type = (free_or_dead < 0
-                    || ts->kind != TEMP_NORMAL
-                    ? TEMP_VAL_MEM : TEMP_VAL_DEAD);
+    ts->val_type = new_type;
 }
 
 /* Mark a temporary as dead.  */
@@ -XXX,XX +XXX,XX @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
 static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
                       TCGRegSet preferred_regs, int free_or_dead)
 {
-    if (temp_readonly(ts)) {
-        return;
-    }
-    if (!ts->mem_coherent) {
+    if (!temp_readonly(ts) && !ts->mem_coherent) {
         if (!ts->mem_allocated) {
             temp_allocate_frame(s, ts);
         }
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
 
     for (i = s->nb_globals; i < s->nb_temps; i++) {
         TCGTemp *ts = &s->temps[i];
-        if (ts->kind == TEMP_LOCAL) {
+
+        switch (ts->kind) {
+        case TEMP_LOCAL:
             temp_save(s, ts, allocated_regs);
-        } else {
+            break;
+        case TEMP_NORMAL:
             /* The liveness analysis already ensures that temps are dead.
                Keep an tcg_debug_assert for safety. */
             tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD);
+            break;
+        case TEMP_CONST:
+            /* Similarly, we should have freed any allocated register. */
+            tcg_debug_assert(ts->val_type == TEMP_VAL_CONST);
+            break;
+        default:
+            g_assert_not_reached();
         }
     }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         i_preferred_regs = o_preferred_regs = 0;
         if (arg_ct->ialias) {
             o_preferred_regs = op->output_pref[arg_ct->alias_index];
-            if (ts->kind == TEMP_FIXED) {
-                /* if fixed register, we must allocate a new register
-                   if the alias is not the same register */
-                if (arg != op->args[arg_ct->alias_index]) {
-                    goto allocate_in_reg;
-                }
-            } else {
-                /* if the input is aliased to an output and if it is
-                   not dead after the instruction, we must allocate
-                   a new register and move it */
-                if (!IS_DEAD_ARG(i)) {
-                    goto allocate_in_reg;
-                }
 
-                /* check if the current register has already been allocated
-                   for another input aliased to an output */
-                if (ts->val_type == TEMP_VAL_REG) {
-                    int k2, i2;
-                    reg = ts->reg;
-                    for (k2 = 0 ; k2 < k ; k2++) {
-                        i2 = def->args_ct[nb_oargs + k2].sort_index;
-                        if (def->args_ct[i2].ialias && reg == new_args[i2]) {
-                            goto allocate_in_reg;
-                        }
+            /*
+             * If the input is readonly, then it cannot also be an
+             * output and aliased to itself.  If the input is not
+             * dead after the instruction, we must allocate a new
+             * register and move it.
+             */
+            if (temp_readonly(ts) || !IS_DEAD_ARG(i)) {
+                goto allocate_in_reg;
+            }
+
+            /*
+             * Check if the current register has already been allocated
+             * for another input aliased to an output.
+             */
+            if (ts->val_type == TEMP_VAL_REG) {
+                reg = ts->reg;
+                for (int k2 = 0; k2 < k; k2++) {
+                    int i2 = def->args_ct[nb_oargs + k2].sort_index;
+                    if (def->args_ct[i2].ialias && reg == new_args[i2]) {
+                        goto allocate_in_reg;
                     }
                 }
-                i_preferred_regs = o_preferred_regs;
             }
+            i_preferred_regs = o_preferred_regs;
         }
 
         temp_load(s, ts, arg_ct->regs, i_allocated_regs, i_preferred_regs);
         reg = ts->reg;
 
-        if (tcg_regset_test_reg(arg_ct->regs, reg)) {
-            /* nothing to do : the constraint is satisfied */
-        } else {
-        allocate_in_reg:
-            /* allocate a new register matching the constraint 
-               and move the temporary register into it */
+        if (!tcg_regset_test_reg(arg_ct->regs, reg)) {
+ allocate_in_reg:
+            /*
+             * Allocate a new register matching the constraint
+             * and move the temporary register into it.
+             */
             temp_load(s, ts, tcg_target_available_regs[ts->type],
                       i_allocated_regs, 0);
             reg = tcg_reg_alloc(s, arg_ct->regs, i_allocated_regs,
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
     }
 #endif
 
+    for (i = 0; i < TCG_TYPE_COUNT; ++i) {
+        if (s->const_table[i]) {
+            g_hash_table_destroy(s->const_table[i]);
+            s->const_table[i] = NULL;
+        }
+    }
+
     tcg_reg_alloc_start(s);
 
     s->code_buf = tb->tc.ptr;
-- 
2.25.1

Prefer TEMP_CONST over anything else.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_arg_info(TempOptInfo *infos,
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 {
-    TCGTemp *i;
+    TCGTemp *i, *g, *l;
 
-    /* If this is already a global, we can't do better. */
-    if (ts->kind >= TEMP_GLOBAL) {
+    /* If this is already readonly, we can't do better. */
+    if (temp_readonly(ts)) {
         return ts;
     }
 
-    /* Search for a global first. */
+    g = l = NULL;
     for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-        if (i->kind >= TEMP_GLOBAL) {
+        if (temp_readonly(i)) {
             return i;
-        }
-    }
-
-    /* If it is a temp, search for a temp local. */
-    if (ts->kind == TEMP_NORMAL) {
-        for (i = ts_info(ts)->next_copy; i != ts; i = ts_info(i)->next_copy) {
-            if (i->kind >= TEMP_LOCAL) {
-                return i;
+        } else if (i->kind > ts->kind) {
+            if (i->kind == TEMP_GLOBAL) {
+                g = i;
+            } else if (i->kind == TEMP_LOCAL) {
+                l = i;
             }
         }
     }
 
-    /* Failure to find a better representation, return the same temp. */
-    return ts;
+    /* If we didn't find a better representation, return the same temp. */
+    return g ? g : l ? l : ts;
 }
 
 static bool ts_are_copies(TCGTemp *ts1, TCGTemp *ts2)
-- 
2.25.1

Do not allocate a large block for indexing.  Instead, allocate
a TempOptInfo for each temporary as it is seen.

In general, this will use less memory, if we consider that most
TBs do not touch every target register.  This also allows us to
allocate TempOptInfo for new temps created during optimization.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 60 ++++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 26 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(TempOptInfo *infos,
-                         TCGTempSet *temps_used, TCGTemp *ts)
+static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
-    if (!test_bit(idx, temps_used->l)) {
-        TempOptInfo *ti = &infos[idx];
+    TempOptInfo *ti;
 
+    if (test_bit(idx, temps_used->l)) {
+        return;
+    }
+    set_bit(idx, temps_used->l);
+
+    ti = ts->state_ptr;
+    if (ti == NULL) {
+        ti = tcg_malloc(sizeof(TempOptInfo));
         ts->state_ptr = ti;
-        ti->next_copy = ts;
-        ti->prev_copy = ts;
-        if (ts->kind == TEMP_CONST) {
-            ti->is_const = true;
-            ti->val = ti->mask = ts->val;
-            if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-                /* High bits of a 32-bit quantity are garbage.  */
-                ti->mask |= ~0xffffffffull;
-            }
-        } else {
-            ti->is_const = false;
-            ti->mask = -1;
+    }
+
+    ti->next_copy = ts;
+    ti->prev_copy = ts;
+    if (ts->kind == TEMP_CONST) {
+        ti->is_const = true;
+        ti->val = ts->val;
+        ti->mask = ts->val;
+        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
+            /* High bits of a 32-bit quantity are garbage.  */
+            ti->mask |= ~0xffffffffull;
         }
-        set_bit(idx, temps_used->l);
+    } else {
+        ti->is_const = false;
+        ti->mask = -1;
     }
 }
 
-static void init_arg_info(TempOptInfo *infos,
-                          TCGTempSet *temps_used, TCGArg arg)
+static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
 {
-    init_ts_info(infos, temps_used, arg_temp(arg));
+    init_ts_info(temps_used, arg_temp(arg));
 }
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
-    int nb_temps, nb_globals;
+    int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    TempOptInfo *infos;
     TCGTempSet temps_used;
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
+
     bitmap_zero(temps_used.l, nb_temps);
-    infos = tcg_malloc(sizeof(TempOptInfo) * nb_temps);
+    for (i = 0; i < nb_temps; ++i) {
+        s->temps[i].state_ptr = NULL;
+    }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         uint64_t mask, partmask, affected, tmp;
-        int nb_oargs, nb_iargs, i;
+        int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
                 TCGTemp *ts = arg_temp(op->args[i]);
                 if (ts) {
-                    init_ts_info(infos, &temps_used, ts);
+                    init_ts_info(&temps_used, ts);
                 }
             }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(infos, &temps_used, op->args[i]);
+                init_arg_info(&temps_used, op->args[i]);
             }
         }
 
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 108 ++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 59 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, uint64_t val)
-{
-    const TCGOpDef *def;
-    TCGOpcode new_op;
-    uint64_t mask;
-    TempOptInfo *di = arg_info(dst);
-
-    def = &tcg_op_defs[op->opc];
-    if (def->flags & TCG_OPF_VECTOR) {
-        new_op = INDEX_op_dupi_vec;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        new_op = INDEX_op_movi_i64;
-    } else {
-        new_op = INDEX_op_movi_i32;
-    }
-    op->opc = new_op;
-    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
-    op->args[0] = dst;
-    op->args[1] = val;
-
-    reset_temp(dst);
-    di->is_const = true;
-    di->val = val;
-    mask = val;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_movi_i32) {
-        /* High bits of the destination are now garbage.  */
-        mask |= ~0xffffffffull;
-    }
-    di->mask = mask;
-}
-
 static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
+static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+                             TCGOp *op, TCGArg dst, uint64_t val)
+{
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    TCGType type;
+    TCGTemp *tv;
+
+    if (def->flags & TCG_OPF_VECTOR) {
+        type = TCGOP_VECL(op) + TCG_TYPE_V64;
+    } else if (def->flags & TCG_OPF_64BIT) {
+        type = TCG_TYPE_I64;
+    } else {
+        type = TCG_TYPE_I32;
+    }
+
+    /* Convert movi to mov with constant temp. */
+    tv = tcg_constant_internal(type, val);
+    init_ts_info(temps_used, tv);
+    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+}
+
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
 {
     uint64_t l64, h64;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
 
-    bitmap_zero(temps_used.l, nb_temps);
+    memset(&temps_used, 0, sizeof(temps_used));
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, op, op->args[0], 0);
+            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         CASE_OP_32_64(movi):
         case INDEX_op_dupi_vec:
-            tcg_opt_gen_movi(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], op->args[1]);
             break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[1])->val;
                 if (tmp == arg_info(op->args[2])->val) {
-                    tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                     break;
                 }
             } else if (args_are_copies(op->args[1], op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                            op->args[1], op->args[2]);
             if (tmp != 2) {
                 if (tmp) {
-                    bitmap_zero(temps_used.l, nb_temps);
+                    memset(&temps_used, 0, sizeof(temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[3];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 uint64_t a = ((uint64_t)ah << 32) | al;
                 uint64_t b = ((uint64_t)bh << 32) | bl;
                 TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32);
+                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
 
                 if (opc == INDEX_op_add2_i32) {
                     a += b;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 uint32_t b = arg_info(op->args[3])->val;
                 uint64_t r = (uint64_t)a * b;
                 TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32);
+                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
                 if (tmp) {
             do_brcond_true:
-                    bitmap_zero(temps_used.l, nb_temps);
+                    memset(&temps_used, 0, sizeof(temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[5];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                bitmap_zero(temps_used.l, nb_temps);
+                memset(&temps_used, 0, sizeof(temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     goto do_default;
                 }
             do_brcond_low:
-                bitmap_zero(temps_used.l, nb_temps);
+                memset(&temps_used, 0, sizeof(temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
             } else if ((op->args[5] == TCG_COND_LT
                         || op->args[5] == TCG_COND_GE)
                        && arg_is_const(op->args[3])
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                block, otherwise we only trash the output args.  "mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
-                bitmap_zero(temps_used.l, nb_temps);
+                memset(&temps_used, 0, sizeof(temps_used));
             } else {
         do_reset_output:
                 for (i = 0; i < nb_oargs; i++) {
-- 
2.25.1

Because we now store uint64_t in TCGTemp, we can now always
store the full 64-bit duplicate immediate.  So remove the
difference between 32- and 64-bit hosts.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c   |  9 ++++-----
 tcg/tcg-op-vec.c | 39 ++++++++++-----------------------------
 tcg/tcg.c        |  7 +------
 3 files changed, 15 insertions(+), 40 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[1])->val;
-                if (tmp == arg_info(op->args[2])->val) {
-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
-                    break;
-                }
+                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
+                                 deposit64(arg_info(op->args[1])->val, 32, 32,
+                                           arg_info(op->args[2])->val));
+                break;
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
     }
 }
 
-#define MO_REG  (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
-
-static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
-{
-    TCGTemp *rt = tcgv_vec_temp(r);
-    vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);
-}
-
 TCGv_vec tcg_const_zeros_vec(TCGType type)
 {
     TCGv_vec ret = tcg_temp_new_vec(type);
-    do_dupi_vec(ret, MO_REG, 0);
+    tcg_gen_dupi_vec(MO_64, ret, 0);
     return ret;
 }
 
 TCGv_vec tcg_const_ones_vec(TCGType type)
 {
     TCGv_vec ret = tcg_temp_new_vec(type);
-    do_dupi_vec(ret, MO_REG, -1);
+    tcg_gen_dupi_vec(MO_64, ret, -1);
     return ret;
 }
 
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
 
 void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 {
-    if (TCG_TARGET_REG_BITS == 64) {
-        do_dupi_vec(r, MO_64, a);
-    } else if (a == dup_const(MO_32, a)) {
-        do_dupi_vec(r, MO_32, a);
-    } else {
-        TCGv_i64 c = tcg_const_i64(a);
-        tcg_gen_dup_i64_vec(MO_64, r, c);
-        tcg_temp_free_i64(c);
-    }
+    tcg_gen_dupi_vec(MO_64, r, a);
 }
 
 void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
 {
-    do_dupi_vec(r, MO_REG, dup_const(MO_32, a));
+    tcg_gen_dupi_vec(MO_32, r, a);
 }
 
 void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
 {
-    do_dupi_vec(r, MO_REG, dup_const(MO_16, a));
+    tcg_gen_dupi_vec(MO_16, r, a);
 }
 
 void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
 {
-    do_dupi_vec(r, MO_REG, dup_const(MO_8, a));
+    tcg_gen_dupi_vec(MO_8, r, a);
 }
 
 void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
 {
-    if (vece == MO_64) {
-        tcg_gen_dup64i_vec(r, a);
-    } else {
-        do_dupi_vec(r, MO_REG, dup_const(vece, a));
-    }
+    TCGTemp *rt = tcgv_vec_temp(r);
+    tcg_gen_mov_vec(r, tcg_constant_vec(rt->base_type, vece, a));
 }
 
 void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
             if (tcg_can_emit_vec_op(INDEX_op_sari_vec, type, vece) > 0) {
                 tcg_gen_sari_vec(vece, t, a, (8 << vece) - 1);
             } else {
-                do_dupi_vec(t, MO_REG, 0);
-                tcg_gen_cmp_vec(TCG_COND_LT, vece, t, a, t);
+                tcg_gen_cmp_vec(TCG_COND_LT, vece, t, a,
+                                tcg_constant_vec(type, vece, 0));
             }
             tcg_gen_xor_vec(vece, r, a, t);
             tcg_gen_sub_vec(vece, r, r, t);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
              * The targets will, in general, have to do this search anyway,
              * do this generically.
              */
-            if (TCG_TARGET_REG_BITS == 32) {
-                val = dup_const(MO_32, val);
-                vece = MO_32;
-            }
             if (val == dup_const(MO_8, val)) {
                 vece = MO_8;
             } else if (val == dup_const(MO_16, val)) {
                 vece = MO_16;
-            } else if (TCG_TARGET_REG_BITS == 64 &&
-                       val == dup_const(MO_32, val)) {
+            } else if (val == dup_const(MO_32, val)) {
                 vece = MO_32;
             }
 
-- 
2.25.1

We must do this before we adjust tcg_gen_movi_i32, lest the
under-the-hood poking that we do for icount be broken.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/gen-icount.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -XXX,XX +XXX,XX @@ static inline void gen_io_end(void)
 
 static inline void gen_tb_start(TranslationBlock *tb)
 {
-    TCGv_i32 count, imm;
+    TCGv_i32 count;
 
     tcg_ctx->exitreq_label = gen_new_label();
     if (tb_cflags(tb) & CF_USE_ICOUNT) {
@@ -XXX,XX +XXX,XX @@ static inline void gen_tb_start(TranslationBlock *tb)
                    offsetof(ArchCPU, env));
 
     if (tb_cflags(tb) & CF_USE_ICOUNT) {
-        imm = tcg_temp_new_i32();
-        /* We emit a movi with a dummy immediate argument. Keep the insn index
-         * of the movi so that we later (when we know the actual insn count)
-         * can update the immediate argument with the actual insn count.  */
-        tcg_gen_movi_i32(imm, 0xdeadbeef);
+        /*
+         * We emit a sub with a dummy immediate argument. Keep the insn index
+         * of the sub so that we later (when we know the actual insn count)
+         * can update the argument with the actual insn count.
+         */
+        tcg_gen_sub_i32(count, count, tcg_constant_i32(0));
         icount_start_insn = tcg_last_op();
-
-        tcg_gen_sub_i32(count, count, imm);
-        tcg_temp_free_i32(imm);
     }
 
     tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, tcg_ctx->exitreq_label);
@@ -XXX,XX +XXX,XX @@ static inline void gen_tb_start(TranslationBlock *tb)
 static inline void gen_tb_end(TranslationBlock *tb, int num_insns)
 {
     if (tb_cflags(tb) & CF_USE_ICOUNT) {
-        /* Update the num_insn immediate parameter now that we know
-         * the actual insn count.  */
-        tcg_set_insn_param(icount_start_insn, 1, num_insns);
+        /*
+         * Update the num_insn immediate parameter now that we know
+         * the actual insn count.
+         */
+        tcg_set_insn_param(icount_start_insn, 2,
+                           tcgv_i32_arg(tcg_constant_i32(num_insns)));
     }
 
     gen_set_label(tcg_ctx->exitreq_label);
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op.h |  13 +--
 tcg/tcg-op.c         | 227 ++++++++++++++++++++-----------------------
 2 files changed, 109 insertions(+), 131 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mb(TCGBar);
 
 /* 32 bit ops */
 
+void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg);
 void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
 void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
 void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_mov_i32(TCGv_i32 ret, TCGv_i32 arg)
     }
 }
 
-static inline void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg)
-{
-    tcg_gen_op2i_i32(INDEX_op_movi_i32, ret, arg);
-}
-
 static inline void tcg_gen_ld8u_i32(TCGv_i32 ret, TCGv_ptr arg2,
                                     tcg_target_long offset)
 {
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
 
 /* 64 bit ops */
 
+void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
 void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
 void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
     }
 }
 
-static inline void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
-{
-    tcg_gen_op2i_i64(INDEX_op_movi_i64, ret, arg);
-}
-
 static inline void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2,
                                     tcg_target_long offset)
 {
@@ -XXX,XX +XXX,XX @@ static inline void tcg_gen_sub_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 
 void tcg_gen_discard_i64(TCGv_i64 arg);
 void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg);
-void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg);
 void tcg_gen_ld8u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
 void tcg_gen_ld8s_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
 void tcg_gen_ld16u_i64(TCGv_i64 ret, TCGv_ptr arg2, tcg_target_long offset);
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mb(TCGBar mb_type)
 
 /* 32 bit ops */
 
+void tcg_gen_movi_i32(TCGv_i32 ret, int32_t arg)
+{
+    tcg_gen_mov_i32(ret, tcg_constant_i32(arg));
+}
+
 void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_add_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_add_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2)
         /* Don't recurse with tcg_gen_neg_i32.  */
         tcg_gen_op2_i32(INDEX_op_neg_i32, ret, arg2);
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg1);
-        tcg_gen_sub_i32(ret, t0, arg2);
-        tcg_temp_free_i32(t0);
+        tcg_gen_sub_i32(ret, tcg_constant_i32(arg1), arg2);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_sub_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_sub_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
 void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
-    TCGv_i32 t0;
     /* Some cases can be optimized here.  */
     switch (arg2) {
     case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
         }
         break;
     }
-    t0 = tcg_const_i32(arg2);
-    tcg_gen_and_i32(ret, arg1, t0);
-    tcg_temp_free_i32(t0);
+
+    tcg_gen_and_i32(ret, arg1, tcg_constant_i32(arg2));
 }
 
 void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
     } else if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_or_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_or_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
         /* Don't recurse with tcg_gen_not_i32.  */
         tcg_gen_op2_i32(INDEX_op_not_i32, ret, arg1);
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_xor_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_xor_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_shl_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_shl_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_shr_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_shr_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_sar_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_sar_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_brcondi_i32(TCGCond cond, TCGv_i32 arg1, int32_t arg2, TCGLabel *l)
     if (cond == TCG_COND_ALWAYS) {
         tcg_gen_br(l);
     } else if (cond != TCG_COND_NEVER) {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_brcond_i32(cond, arg1, t0, l);
-        tcg_temp_free_i32(t0);
+        tcg_gen_brcond_i32(cond, arg1, tcg_constant_i32(arg2), l);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_setcond_i32(TCGCond cond, TCGv_i32 ret,
 void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret,
                           TCGv_i32 arg1, int32_t arg2)
 {
-    TCGv_i32 t0 = tcg_const_i32(arg2);
-    tcg_gen_setcond_i32(cond, ret, arg1, t0);
-    tcg_temp_free_i32(t0);
+    tcg_gen_setcond_i32(cond, ret, arg1, tcg_constant_i32(arg2));
 }
 
 void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
     } else if (is_power_of_2(arg2)) {
         tcg_gen_shli_i32(ret, arg1, ctz32(arg2));
     } else {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_mul_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_mul_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_clz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 
 void tcg_gen_clzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
 {
-    TCGv_i32 t = tcg_const_i32(arg2);
-    tcg_gen_clz_i32(ret, arg1, t);
-    tcg_temp_free_i32(t);
+    tcg_gen_clz_i32(ret, arg1, tcg_constant_i32(arg2));
 }
 
 void tcg_gen_ctz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctz_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
             tcg_gen_clzi_i32(t, t, 32);
             tcg_gen_xori_i32(t, t, 31);
         }
-        z = tcg_const_i32(0);
+        z = tcg_constant_i32(0);
         tcg_gen_movcond_i32(TCG_COND_EQ, ret, arg1, z, arg2, t);
         tcg_temp_free_i32(t);
-        tcg_temp_free_i32(z);
     } else {
         gen_helper_ctz_i32(ret, arg1, arg2);
     }
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
         tcg_gen_ctpop_i32(ret, t);
         tcg_temp_free_i32(t);
     } else {
-        TCGv_i32 t = tcg_const_i32(arg2);
-        tcg_gen_ctz_i32(ret, arg1, t);
-        tcg_temp_free_i32(t);
+        tcg_gen_ctz_i32(ret, arg1, tcg_constant_i32(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
     if (arg2 == 0) {
         tcg_gen_mov_i32(ret, arg1);
     } else if (TCG_TARGET_HAS_rot_i32) {
-        TCGv_i32 t0 = tcg_const_i32(arg2);
-        tcg_gen_rotl_i32(ret, arg1, t0);
-        tcg_temp_free_i32(t0);
+        tcg_gen_rotl_i32(ret, arg1, tcg_constant_i32(arg2));
     } else {
         TCGv_i32 t0, t1;
         t0 = tcg_temp_new_i32();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
         tcg_gen_andi_i32(ret, arg, (1u << len) - 1);
     } else if (TCG_TARGET_HAS_deposit_i32
                && TCG_TARGET_deposit_i32_valid(ofs, len)) {
-        TCGv_i32 zero = tcg_const_i32(0);
+        TCGv_i32 zero = tcg_constant_i32(0);
         tcg_gen_op5ii_i32(INDEX_op_deposit_i32, ret, zero, arg, ofs, len);
-        tcg_temp_free_i32(zero);
     } else {
         /* To help two-operand hosts we prefer to zero-extend first,
            which allows ARG to stay live.  */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
     } else {
         TCGv_i32 t0 = tcg_temp_new_i32();
         TCGv_i32 t1 = tcg_temp_new_i32();
-        TCGv_i32 t2 = tcg_const_i32(0x00ff00ff);
+        TCGv_i32 t2 = tcg_constant_i32(0x00ff00ff);
 
                                         /* arg = abcd */
         tcg_gen_shri_i32(t0, arg, 8);   /*  t0 = .abc */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
 
         tcg_temp_free_i32(t0);
         tcg_temp_free_i32(t1);
-        tcg_temp_free_i32(t2);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_discard_i64(TCGv_i64 arg)
 
 void tcg_gen_mov_i64(TCGv_i64 ret, TCGv_i64 arg)
 {
-    tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
-    tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
+    TCGTemp *ts = tcgv_i64_temp(arg);
+
+    /* Canonicalize TCGv_i64 TEMP_CONST into TCGv_i32 TEMP_CONST. */
+    if (ts->kind == TEMP_CONST) {
+        tcg_gen_movi_i64(ret, ts->val);
+    } else {
+        tcg_gen_mov_i32(TCGV_LOW(ret), TCGV_LOW(arg));
+        tcg_gen_mov_i32(TCGV_HIGH(ret), TCGV_HIGH(arg));
+    }
 }
 
 void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
     tcg_temp_free_i64(t0);
     tcg_temp_free_i32(t1);
 }
+
+#else
+
+void tcg_gen_movi_i64(TCGv_i64 ret, int64_t arg)
+{
+    tcg_gen_mov_i64(ret, tcg_constant_i64(arg));
+}
+
 #endif /* TCG_TARGET_REG_SIZE == 32 */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
+    } else if (TCG_TARGET_REG_BITS == 64) {
+        tcg_gen_add_i64(ret, arg1, tcg_constant_i64(arg2));
     } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_add_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
+        tcg_gen_add2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
+                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
+                         tcg_constant_i32(arg2), tcg_constant_i32(arg2 >> 32));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2)
     if (arg1 == 0 && TCG_TARGET_HAS_neg_i64) {
         /* Don't recurse with tcg_gen_neg_i64.  */
         tcg_gen_op2_i64(INDEX_op_neg_i64, ret, arg2);
+    } else if (TCG_TARGET_REG_BITS == 64) {
+        tcg_gen_sub_i64(ret, tcg_constant_i64(arg1), arg2);
     } else {
-        TCGv_i64 t0 = tcg_const_i64(arg1);
-        tcg_gen_sub_i64(ret, t0, arg2);
-        tcg_temp_free_i64(t0);
+        tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
+                         tcg_constant_i32(arg1), tcg_constant_i32(arg1 >> 32),
+                         TCGV_LOW(arg2), TCGV_HIGH(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
     /* some cases can be optimized here */
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
+    } else if (TCG_TARGET_REG_BITS == 64) {
+        tcg_gen_sub_i64(ret, arg1, tcg_constant_i64(arg2));
     } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_sub_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
+        tcg_gen_sub2_i32(TCGV_LOW(ret), TCGV_HIGH(ret),
+                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
+                         tcg_constant_i32(arg2), tcg_constant_i32(arg2 >> 32));
     }
 }
 
 void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
-    TCGv_i64 t0;
-
     if (TCG_TARGET_REG_BITS == 32) {
         tcg_gen_andi_i32(TCGV_LOW(ret), TCGV_LOW(arg1), arg2);
         tcg_gen_andi_i32(TCGV_HIGH(ret), TCGV_HIGH(arg1), arg2 >> 32);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
         }
         break;
     }
-    t0 = tcg_const_i64(arg2);
-    tcg_gen_and_i64(ret, arg1, t0);
-    tcg_temp_free_i64(t0);
+
+    tcg_gen_and_i64(ret, arg1, tcg_constant_i64(arg2));
 }
 
 void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
     } else if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
     } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_or_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
+        tcg_gen_or_i64(ret, arg1, tcg_constant_i64(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
         /* Don't recurse with tcg_gen_not_i64.  */
         tcg_gen_op2_i64(INDEX_op_not_i64, ret, arg1);
     } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_xor_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
+        tcg_gen_xor_i64(ret, arg1, tcg_constant_i64(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
     } else if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
     } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_shl_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
+        tcg_gen_shl_i64(ret, arg1, tcg_constant_i64(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
     } else if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
     } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_shr_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
+        tcg_gen_shr_i64(ret, arg1, tcg_constant_i64(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
     } else if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
     } else {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_sar_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
+        tcg_gen_sar_i64(ret, arg1, tcg_constant_i64(arg2));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_brcond_i64(TCGCond cond, TCGv_i64 arg1, TCGv_i64 arg2, TCGLabel *l)
 
 void tcg_gen_brcondi_i64(TCGCond cond, TCGv_i64 arg1, int64_t arg2, TCGLabel *l)
 {
-    if (cond == TCG_COND_ALWAYS) {
+    if (TCG_TARGET_REG_BITS == 64) {
+        tcg_gen_brcond_i64(cond, arg1, tcg_constant_i64(arg2), l);
+    } else if (cond == TCG_COND_ALWAYS) {
         tcg_gen_br(l);
     } else if (cond != TCG_COND_NEVER) {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_brcond_i64(cond, arg1, t0, l);
-        tcg_temp_free_i64(t0);
+        l->refs++;
+        tcg_gen_op6ii_i32(INDEX_op_brcond2_i32,
+                          TCGV_LOW(arg1), TCGV_HIGH(arg1),
+                          tcg_constant_i32(arg2),
+                          tcg_constant_i32(arg2 >> 32),
+                          cond, label_arg(l));
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_setcond_i64(TCGCond cond, TCGv_i64 ret,
 void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret,
                           TCGv_i64 arg1, int64_t arg2)
 {
-    TCGv_i64 t0 = tcg_const_i64(arg2);
-    tcg_gen_setcond_i64(cond, ret, arg1, t0);
-    tcg_temp_free_i64(t0);
+    if (TCG_TARGET_REG_BITS == 64) {
+        tcg_gen_setcond_i64(cond, ret, arg1, tcg_constant_i64(arg2));
+    } else if (cond == TCG_COND_ALWAYS) {
+        tcg_gen_movi_i64(ret, 1);
+    } else if (cond == TCG_COND_NEVER) {
+        tcg_gen_movi_i64(ret, 0);
+    } else {
+        tcg_gen_op6i_i32(INDEX_op_setcond2_i32, TCGV_LOW(ret),
+                         TCGV_LOW(arg1), TCGV_HIGH(arg1),
+                         tcg_constant_i32(arg2),
+                         tcg_constant_i32(arg2 >> 32), cond);
+        tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
+    }
 }
 
 void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
     } else {
         TCGv_i64 t0 = tcg_temp_new_i64();
         TCGv_i64 t1 = tcg_temp_new_i64();
-        TCGv_i64 t2 = tcg_const_i64(0x00ff00ff);
+        TCGv_i64 t2 = tcg_constant_i64(0x00ff00ff);
 
                                         /* arg = ....abcd */
         tcg_gen_shri_i64(t0, arg, 8);   /*  t0 = .....abc */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg)
 
         tcg_temp_free_i64(t0);
         tcg_temp_free_i64(t1);
-        tcg_temp_free_i64(t2);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_clzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
     if (TCG_TARGET_REG_BITS == 32
         && TCG_TARGET_HAS_clz_i32
         && arg2 <= 0xffffffffu) {
-        TCGv_i32 t = tcg_const_i32((uint32_t)arg2 - 32);
-        tcg_gen_clz_i32(t, TCGV_LOW(arg1), t);
+        TCGv_i32 t = tcg_temp_new_i32();
+        tcg_gen_clzi_i32(t, TCGV_LOW(arg1), arg2 - 32);
         tcg_gen_addi_i32(t, t, 32);
         tcg_gen_clz_i32(TCGV_LOW(ret), TCGV_HIGH(arg1), t);
         tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
         tcg_temp_free_i32(t);
     } else {
-        TCGv_i64 t = tcg_const_i64(arg2);
-        tcg_gen_clz_i64(ret, arg1, t);
-        tcg_temp_free_i64(t);
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_clz_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctz_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
             tcg_gen_clzi_i64(t, t, 64);
             tcg_gen_xori_i64(t, t, 63);
         }
-        z = tcg_const_i64(0);
+        z = tcg_constant_i64(0);
         tcg_gen_movcond_i64(TCG_COND_EQ, ret, arg1, z, arg2, t);
         tcg_temp_free_i64(t);
         tcg_temp_free_i64(z);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
     if (TCG_TARGET_REG_BITS == 32
         && TCG_TARGET_HAS_ctz_i32
         && arg2 <= 0xffffffffu) {
-        TCGv_i32 t32 = tcg_const_i32((uint32_t)arg2 - 32);
-        tcg_gen_ctz_i32(t32, TCGV_HIGH(arg1), t32);
+        TCGv_i32 t32 = tcg_temp_new_i32();
+        tcg_gen_ctzi_i32(t32, TCGV_HIGH(arg1), arg2 - 32);
         tcg_gen_addi_i32(t32, t32, 32);
         tcg_gen_ctz_i32(TCGV_LOW(ret), TCGV_LOW(arg1), t32);
         tcg_gen_movi_i32(TCGV_HIGH(ret), 0);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
         tcg_gen_ctpop_i64(ret, t);
         tcg_temp_free_i64(t);
     } else {
-        TCGv_i64 t64 = tcg_const_i64(arg2);
-        tcg_gen_ctz_i64(ret, arg1, t64);
-        tcg_temp_free_i64(t64);
+        TCGv_i64 t0 = tcg_const_i64(arg2);
+        tcg_gen_ctz_i64(ret, arg1, t0);
+        tcg_temp_free_i64(t0);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
     if (arg2 == 0) {
         tcg_gen_mov_i64(ret, arg1);
     } else if (TCG_TARGET_HAS_rot_i64) {
-        TCGv_i64 t0 = tcg_const_i64(arg2);
-        tcg_gen_rotl_i64(ret, arg1, t0);
-        tcg_temp_free_i64(t0);
+        tcg_gen_rotl_i64(ret, arg1, tcg_constant_i64(arg2));
     } else {
         TCGv_i64 t0, t1;
         t0 = tcg_temp_new_i64();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
         tcg_gen_andi_i64(ret, arg, (1ull << len) - 1);
     } else if (TCG_TARGET_HAS_deposit_i64
                && TCG_TARGET_deposit_i64_valid(ofs, len)) {
-        TCGv_i64 zero = tcg_const_i64(0);
+        TCGv_i64 zero = tcg_constant_i64(0);
         tcg_gen_op5ii_i64(INDEX_op_deposit_i64, ret, zero, arg, ofs, len);
-        tcg_temp_free_i64(zero);
     } else {
         if (TCG_TARGET_REG_BITS == 32) {
             if (ofs >= 32) {
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i32(TCGv_i32 retv, TCGv addr, TCGv_i32 cmpv,
 
 #ifdef CONFIG_SOFTMMU
         {
-            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
-            gen(retv, cpu_env, addr, cmpv, newv, oi);
-            tcg_temp_free_i32(oi);
+            TCGMemOpIdx oi = make_memop_idx(memop & ~MO_SIGN, idx);
+            gen(retv, cpu_env, addr, cmpv, newv, tcg_constant_i32(oi));
         }
 #else
         gen(retv, cpu_env, addr, cmpv, newv);
@@ -XXX,XX +XXX,XX @@ void tcg_gen_atomic_cmpxchg_i64(TCGv_i64 retv, TCGv addr, TCGv_i64 cmpv,
 
 #ifdef CONFIG_SOFTMMU
         {
-            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop, idx));
-            gen(retv, cpu_env, addr, cmpv, newv, oi);
-            tcg_temp_free_i32(oi);
+            TCGMemOpIdx oi = make_memop_idx(memop, idx);
+            gen(retv, cpu_env, addr, cmpv, newv, tcg_constant_i32(oi));
         }
 #else
         gen(retv, cpu_env, addr, cmpv, newv);
@@ -XXX,XX +XXX,XX @@ static void do_atomic_op_i32(TCGv_i32 ret, TCGv addr, TCGv_i32 val,
 
 #ifdef CONFIG_SOFTMMU
     {
-        TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
-        gen(ret, cpu_env, addr, val, oi);
-        tcg_temp_free_i32(oi);
+        TCGMemOpIdx oi = make_memop_idx(memop & ~MO_SIGN, idx);
+        gen(ret, cpu_env, addr, val, tcg_constant_i32(oi));
     }
 #else
     gen(ret, cpu_env, addr, val);
@@ -XXX,XX +XXX,XX @@ static void do_atomic_op_i64(TCGv_i64 ret, TCGv addr, TCGv_i64 val,
 
 #ifdef CONFIG_SOFTMMU
         {
-            TCGv_i32 oi = tcg_const_i32(make_memop_idx(memop & ~MO_SIGN, idx));
-            gen(ret, cpu_env, addr, val, oi);
-            tcg_temp_free_i32(oi);
+            TCGMemOpIdx oi = make_memop_idx(memop & ~MO_SIGN, idx);
+            gen(ret, cpu_env, addr, val, tcg_constant_i32(oi));
         }
 #else
         gen(ret, cpu_env, addr, val);
-- 
2.25.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/plugin-gen.c | 49 +++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -XXX,XX +XXX,XX @@ static TCGOp *copy_extu_i32_i64(TCGOp **begin_op, TCGOp *op)
     if (TCG_TARGET_REG_BITS == 32) {
         /* mov_i32 */
         op = copy_op(begin_op, op, INDEX_op_mov_i32);
-        /* movi_i32 */
-        op = copy_op(begin_op, op, INDEX_op_movi_i32);
+        /* mov_i32 w/ $0 */
+        op = copy_op(begin_op, op, INDEX_op_mov_i32);
     } else {
         /* extu_i32_i64 */
         op = copy_op(begin_op, op, INDEX_op_extu_i32_i64);
@@ -XXX,XX +XXX,XX @@ static TCGOp *copy_mov_i64(TCGOp **begin_op, TCGOp *op)
     return op;
 }
 
-static TCGOp *copy_movi_i64(TCGOp **begin_op, TCGOp *op, uint64_t v)
-{
-    if (TCG_TARGET_REG_BITS == 32) {
-        /* 2x movi_i32 */
-        op = copy_op(begin_op, op, INDEX_op_movi_i32);
-        op->args[1] = v;
-
-        op = copy_op(begin_op, op, INDEX_op_movi_i32);
-        op->args[1] = v >> 32;
-    } else {
-        /* movi_i64 */
-        op = copy_op(begin_op, op, INDEX_op_movi_i64);
-        op->args[1] = v;
-    }
-    return op;
-}
-
 static TCGOp *copy_const_ptr(TCGOp **begin_op, TCGOp *op, void *ptr)
 {
     if (UINTPTR_MAX == UINT32_MAX) {
-        /* movi_i32 */
-        op = copy_op(begin_op, op, INDEX_op_movi_i32);
-        op->args[1] = (uintptr_t)ptr;
+        /* mov_i32 */
+        op = copy_op(begin_op, op, INDEX_op_mov_i32);
+        op->args[1] = tcgv_i32_arg(tcg_constant_i32((uintptr_t)ptr));
     } else {
-        /* movi_i64 */
-        op = copy_movi_i64(begin_op, op, (uint64_t)(uintptr_t)ptr);
+        /* mov_i64 */
+        op = copy_op(begin_op, op, INDEX_op_mov_i64);
+        op->args[1] = tcgv_i64_arg(tcg_constant_i64((uintptr_t)ptr));
     }
     return op;
 }
 
 static TCGOp *copy_const_i64(TCGOp **begin_op, TCGOp *op, uint64_t v)
 {
-    return copy_movi_i64(begin_op, op, v);
+    if (TCG_TARGET_REG_BITS == 32) {
+        /* 2x mov_i32 */
+        op = copy_op(begin_op, op, INDEX_op_mov_i32);
+        op->args[1] = tcgv_i32_arg(tcg_constant_i32(v));
+        op = copy_op(begin_op, op, INDEX_op_mov_i32);
+        op->args[1] = tcgv_i32_arg(tcg_constant_i32(v >> 32));
+    } else {
+        /* mov_i64 */
+        op = copy_op(begin_op, op, INDEX_op_mov_i64);
+        op->args[1] = tcgv_i64_arg(tcg_constant_i64(v));
+    }
+    return op;
 }
 
 static TCGOp *copy_extu_tl_i64(TCGOp **begin_op, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static TCGOp *append_mem_cb(const struct qemu_plugin_dyn_cb *cb,
 
     tcg_debug_assert(type == PLUGIN_GEN_CB_MEM);
 
-    /* const_i32 == movi_i32 ("info", so it remains as is) */
-    op = copy_op(&begin_op, op, INDEX_op_movi_i32);
+    /* const_i32 == mov_i32 ("info", so it remains as is) */
+    op = copy_op(&begin_op, op, INDEX_op_mov_i32);
 
     /* const_ptr */
     op = copy_const_ptr(&begin_op, op, cb->userp);
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg.h |   1 +
 tcg/tcg-op-gvec.c | 125 ++++++++++++++++++----------------------------
 tcg/tcg.c         |   8 +++
 3 files changed, 58 insertions(+), 76 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -XXX,XX +XXX,XX @@ static inline TCGv_i64 tcg_constant_i64(int64_t val)
 }
 
 TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val);
+TCGv_vec tcg_constant_vec_matching(TCGv_vec match, unsigned vece, int64_t val);
 
 #if UINTPTR_MAX == UINT32_MAX
 # define tcg_const_ptr(x)        ((TCGv_ptr)tcg_const_i32((intptr_t)(x)))
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                         gen_helper_gvec_2 *fn)
 {
     TCGv_ptr a0, a1;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 
     tcg_temp_free_ptr(a0);
     tcg_temp_free_ptr(a1);
-    tcg_temp_free_i32(desc);
 }
 
 /* Generate a call to a gvec-style helper with two vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
                          gen_helper_gvec_2i *fn)
 {
     TCGv_ptr a0, a1;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 
     tcg_temp_free_ptr(a0);
     tcg_temp_free_ptr(a1);
-    tcg_temp_free_i32(desc);
 }
 
 /* Generate a call to a gvec-style helper with three vector operands.  */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         gen_helper_gvec_3 *fn)
 {
     TCGv_ptr a0, a1, a2;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
     tcg_temp_free_ptr(a0);
     tcg_temp_free_ptr(a1);
     tcg_temp_free_ptr(a2);
-    tcg_temp_free_i32(desc);
 }
 
 /* Generate a call to a gvec-style helper with four vector operands.  */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         int32_t data, gen_helper_gvec_4 *fn)
 {
     TCGv_ptr a0, a1, a2, a3;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
     tcg_temp_free_ptr(a1);
     tcg_temp_free_ptr(a2);
     tcg_temp_free_ptr(a3);
-    tcg_temp_free_i32(desc);
 }
 
 /* Generate a call to a gvec-style helper with five vector operands.  */
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 {
     TCGv_ptr a0, a1, a2, a3, a4;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
     tcg_temp_free_ptr(a2);
     tcg_temp_free_ptr(a3);
     tcg_temp_free_ptr(a4);
-    tcg_temp_free_i32(desc);
 }
 
 /* Generate a call to a gvec-style helper with three vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                         int32_t data, gen_helper_gvec_2_ptr *fn)
 {
     TCGv_ptr a0, a1;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 
     tcg_temp_free_ptr(a0);
     tcg_temp_free_ptr(a1);
-    tcg_temp_free_i32(desc);
 }
 
 /* Generate a call to a gvec-style helper with three vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         int32_t data, gen_helper_gvec_3_ptr *fn)
 {
     TCGv_ptr a0, a1, a2;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
     tcg_temp_free_ptr(a0);
     tcg_temp_free_ptr(a1);
     tcg_temp_free_ptr(a2);
-    tcg_temp_free_i32(desc);
 }
 
 /* Generate a call to a gvec-style helper with four vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         gen_helper_gvec_4_ptr *fn)
 {
     TCGv_ptr a0, a1, a2, a3;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
     tcg_temp_free_ptr(a1);
     tcg_temp_free_ptr(a2);
     tcg_temp_free_ptr(a3);
-    tcg_temp_free_i32(desc);
 }
 
 /* Generate a call to a gvec-style helper with five vector operands
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         gen_helper_gvec_5_ptr *fn)
 {
     TCGv_ptr a0, a1, a2, a3, a4;
-    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+    TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 
     a0 = tcg_temp_new_ptr();
     a1 = tcg_temp_new_ptr();
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
     tcg_temp_free_ptr(a2);
     tcg_temp_free_ptr(a3);
     tcg_temp_free_ptr(a4);
-    tcg_temp_free_i32(desc);
 }
 
 /* Return true if we want to implement something of OPRSZ bytes
@@ -XXX,XX +XXX,XX @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                 || (TCG_TARGET_REG_BITS == 64
                     && (in_c == 0 || in_c == -1
                         || !check_size_impl(oprsz, 4)))) {
-                t_64 = tcg_const_i64(in_c);
+                t_64 = tcg_constant_i64(in_c);
             } else {
-                t_32 = tcg_const_i32(in_c);
+                t_32 = tcg_constant_i32(in_c);
             }
         }
 
@@ -XXX,XX +XXX,XX @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
     /* Otherwise implement out of line.  */
     t_ptr = tcg_temp_new_ptr();
     tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
-    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
+    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));
 
     if (vece == MO_64) {
         if (in_64) {
             gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
         } else {
-            t_64 = tcg_const_i64(in_c);
+            t_64 = tcg_constant_i64(in_c);
             gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
-            tcg_temp_free_i64(t_64);
         }
     } else {
         typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
@@ -XXX,XX +XXX,XX @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
 
         if (in_32) {
             fns[vece](t_ptr, t_desc, in_32);
-        } else {
+        } else if (in_64) {
             t_32 = tcg_temp_new_i32();
-            if (in_64) {
-                tcg_gen_extrl_i64_i32(t_32, in_64);
-            } else if (vece == MO_8) {
-                tcg_gen_movi_i32(t_32, in_c & 0xff);
-            } else if (vece == MO_16) {
-                tcg_gen_movi_i32(t_32, in_c & 0xffff);
-            } else {
-                tcg_gen_movi_i32(t_32, in_c);
-            }
+            tcg_gen_extrl_i64_i32(t_32, in_64);
             fns[vece](t_ptr, t_desc, t_32);
             tcg_temp_free_i32(t_32);
+        } else {
+            if (vece == MO_8) {
+                in_c &= 0xff;
+            } else if (vece == MO_16) {
+                in_c &= 0xffff;
+            }
+            t_32 = tcg_constant_i32(in_c);
+            fns[vece](t_ptr, t_desc, t_32);
         }
     }
 
     tcg_temp_free_ptr(t_ptr);
-    tcg_temp_free_i32(t_desc);
     return;
 
  done:
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
             if (g->fno) {
                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
             } else {
-                TCGv_i64 tcg_c = tcg_const_i64(c);
+                TCGv_i64 tcg_c = tcg_constant_i64(c);
                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
                                     maxsz, c, g->fnoi);
-                tcg_temp_free_i64(tcg_c);
             }
             oprsz = maxsz;
         }
@@ -XXX,XX +XXX,XX @@ static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
 
 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
 {
-    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
     gen_addv_mask(d, a, b, m);
-    tcg_temp_free_i64(m);
 }
 
 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
 {
-    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
     gen_addv_mask(d, a, b, m);
-    tcg_temp_free_i64(m);
 }
 
 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
 {
-    TCGv_i64 tmp = tcg_const_i64(c);
+    TCGv_i64 tmp = tcg_constant_i64(c);
     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
-    tcg_temp_free_i64(tmp);
 }
 
 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
@@ -XXX,XX +XXX,XX @@ static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
 
 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
 {
-    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
     gen_subv_mask(d, a, b, m);
-    tcg_temp_free_i64(m);
 }
 
 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
 {
-    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
     gen_subv_mask(d, a, b, m);
-    tcg_temp_free_i64(m);
 }
 
 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
 {
-    TCGv_i64 tmp = tcg_const_i64(c);
+    TCGv_i64 tmp = tcg_constant_i64(c);
     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
-    tcg_temp_free_i64(tmp);
 }
 
 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
 
 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
 {
-    TCGv_i32 max = tcg_const_i32(-1);
+    TCGv_i32 max = tcg_constant_i32(-1);
     tcg_gen_add_i32(d, a, b);
     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
-    tcg_temp_free_i32(max);
 }
 
 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
 {
-    TCGv_i64 max = tcg_const_i64(-1);
+    TCGv_i64 max = tcg_constant_i64(-1);
     tcg_gen_add_i64(d, a, b);
     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
-    tcg_temp_free_i64(max);
 }
 
 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
 
 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
 {
-    TCGv_i32 min = tcg_const_i32(0);
+    TCGv_i32 min = tcg_constant_i32(0);
     tcg_gen_sub_i32(d, a, b);
     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
-    tcg_temp_free_i32(min);
 }
 
 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
 {
-    TCGv_i64 min = tcg_const_i64(0);
+    TCGv_i64 min = tcg_constant_i64(0);
     tcg_gen_sub_i64(d, a, b);
     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
-    tcg_temp_free_i64(min);
 }
 
 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -XXX,XX +XXX,XX @@ static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
 
 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
 {
-    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
     gen_negv_mask(d, b, m);
-    tcg_temp_free_i64(m);
 }
 
 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
 {
-    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
+    TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
     gen_negv_mask(d, b, m);
-    tcg_temp_free_i64(m);
 }
 
 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
 {
-    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
+    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
-    tcg_temp_free_i64(tmp);
 }
 
 static const GVecGen2s gop_xors = {
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
                        int64_t c, uint32_t oprsz, uint32_t maxsz)
 {
-    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
+    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
-    tcg_temp_free_i64(tmp);
 }
 
 static const GVecGen2s gop_ors = {
@@ -XXX,XX +XXX,XX @@ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
 {
-    TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
+    TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
-    tcg_temp_free_i64(tmp);
 }
 
 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
@@ -XXX,XX +XXX,XX @@ static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
                                  TCGv_vec a, TCGv_vec b)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
 
-    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
-    tcg_gen_and_vec(vece, t, t, b);
+    tcg_gen_and_vec(vece, t, b, m);
     tcg_gen_shlv_vec(vece, d, a, t);
     tcg_temp_free_vec(t);
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
                                  TCGv_vec a, TCGv_vec b)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
 
-    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
-    tcg_gen_and_vec(vece, t, t, b);
+    tcg_gen_and_vec(vece, t, b, m);
     tcg_gen_shrv_vec(vece, d, a, t);
     tcg_temp_free_vec(t);
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
                                  TCGv_vec a, TCGv_vec b)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
 
-    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
-    tcg_gen_and_vec(vece, t, t, b);
+    tcg_gen_and_vec(vece, t, b, m);
     tcg_gen_sarv_vec(vece, d, a, t);
     tcg_temp_free_vec(t);
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
                                   TCGv_vec a, TCGv_vec b)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
 
-    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
-    tcg_gen_and_vec(vece, t, t, b);
+    tcg_gen_and_vec(vece, t, b, m);
     tcg_gen_rotlv_vec(vece, d, a, t);
     tcg_temp_free_vec(t);
 }
@@ -XXX,XX +XXX,XX @@ static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
                                   TCGv_vec a, TCGv_vec b)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
 
-    tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
-    tcg_gen_and_vec(vece, t, t, b);
+    tcg_gen_and_vec(vece, t, b, m);
     tcg_gen_rotrv_vec(vece, d, a, t);
     tcg_temp_free_vec(t);
 }
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_constant_vec(TCGType type, unsigned vece, int64_t val)
     return temp_tcgv_vec(tcg_constant_internal(type, val));
 }
 
+TCGv_vec tcg_constant_vec_matching(TCGv_vec match, unsigned vece, int64_t val)
+{
+    TCGTemp *t = tcgv_vec_temp(match);
+
+    tcg_debug_assert(t->temp_allocated != 0);
+    return tcg_constant_vec(t->base_type, vece, val);
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
     TCGv_i32 t0;
-- 
2.25.1

The normal movi opcodes are going away.  We need something
for TCI to use internally.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-opc.h    | 8 ++++++++
 tcg/tci.c                | 4 ++--
 tcg/tci/tcg-target.c.inc | 4 ++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -XXX,XX +XXX,XX @@ DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
 #include "tcg-target.opc.h"
 #endif
 
+#ifdef TCG_TARGET_INTERPRETER
+/* These opcodes are only for use between the tci generator and interpreter. */
+DEF(tci_movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
+#if TCG_TARGET_REG_BITS == 64
+DEF(tci_movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
+#endif
+#endif
+
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
 #undef IMPL
diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
             t1 = tci_read_r32(regs, &tb_ptr);
             tci_write_reg32(regs, t0, t1);
             break;
-        case INDEX_op_movi_i32:
+        case INDEX_op_tci_movi_i32:
             t0 = *tb_ptr++;
             t1 = tci_read_i32(&tb_ptr);
             tci_write_reg32(regs, t0, t1);
@@ -XXX,XX +XXX,XX @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
             t1 = tci_read_r64(regs, &tb_ptr);
             tci_write_reg64(regs, t0, t1);
             break;
-        case INDEX_op_movi_i64:
+        case INDEX_op_tci_movi_i64:
             t0 = *tb_ptr++;
             t1 = tci_read_i64(&tb_ptr);
             tci_write_reg64(regs, t0, t1);
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_movi(TCGContext *s, TCGType type,
     uint8_t *old_code_ptr = s->code_ptr;
     uint32_t arg32 = arg;
     if (type == TCG_TYPE_I32 || arg == arg32) {
-        tcg_out_op_t(s, INDEX_op_movi_i32);
+        tcg_out_op_t(s, INDEX_op_tci_movi_i32);
         tcg_out_r(s, t0);
         tcg_out32(s, arg32);
     } else {
         tcg_debug_assert(type == TCG_TYPE_I64);
 #if TCG_TARGET_REG_BITS == 64
-        tcg_out_op_t(s, INDEX_op_movi_i64);
+        tcg_out_op_t(s, INDEX_op_tci_movi_i64);
         tcg_out_r(s, t0);
         tcg_out64(s, arg);
 #else
-- 
2.25.1

These are now completely covered by mov from a
TYPE_CONST temporary.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Aleksandar Markovic <aleksandar.qemu.devel@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-opc.h        |  3 ---
 tcg/optimize.c               |  4 ----
 tcg/tcg-op-vec.c             |  1 -
 tcg/tcg.c                    | 18 +-----------------
 tcg/aarch64/tcg-target.c.inc |  3 ---
 tcg/arm/tcg-target.c.inc     |  1 -
 tcg/i386/tcg-target.c.inc    |  3 ---
 tcg/mips/tcg-target.c.inc    |  2 --
 tcg/ppc/tcg-target.c.inc     |  3 ---
 tcg/riscv/tcg-target.c.inc   |  2 --
 tcg/s390/tcg-target.c.inc    |  2 --
 tcg/sparc/tcg-target.c.inc   |  2 --
 tcg/tci/tcg-target.c.inc     |  2 --
 13 files changed, 1 insertion(+), 45 deletions(-)

diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -XXX,XX +XXX,XX @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 DEF(mb, 0, 0, 1, 0)
 
 DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
-DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
 DEF(setcond_i32, 1, 2, 1, 0)
 DEF(movcond_i32, 1, 4, 1, IMPL(TCG_TARGET_HAS_movcond_i32))
 /* load/store */
@@ -XXX,XX +XXX,XX @@ DEF(ctz_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_ctz_i32))
 DEF(ctpop_i32, 1, 1, 0, IMPL(TCG_TARGET_HAS_ctpop_i32))
 
 DEF(mov_i64, 1, 1, 0, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
-DEF(movi_i64, 1, 0, 1, TCG_OPF_64BIT | TCG_OPF_NOT_PRESENT)
 DEF(setcond_i64, 1, 2, 1, IMPL64)
 DEF(movcond_i64, 1, 4, 1, IMPL64 | IMPL(TCG_TARGET_HAS_movcond_i64))
 /* load/store */
@@ -XXX,XX +XXX,XX @@ DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
 #define IMPLVEC  TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec)
 
 DEF(mov_vec, 1, 1, 0, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
-DEF(dupi_vec, 1, 0, 1, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
 
 DEF(dup_vec, 1, 1, 0, IMPLVEC)
 DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32))
diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(mov):
             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
             break;
-        CASE_OP_32_64(movi):
-        case INDEX_op_dupi_vec:
-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], op->args[1]);
-            break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@ bool tcg_can_emit_vecop_list(const TCGOpcode *list,
         case INDEX_op_xor_vec:
         case INDEX_op_mov_vec:
         case INDEX_op_dup_vec:
-        case INDEX_op_dupi_vec:
         case INDEX_op_dup2_vec:
         case INDEX_op_ld_vec:
         case INDEX_op_st_vec:
diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
         return TCG_TARGET_HAS_goto_ptr;
 
     case INDEX_op_mov_i32:
-    case INDEX_op_movi_i32:
     case INDEX_op_setcond_i32:
     case INDEX_op_brcond_i32:
     case INDEX_op_ld8u_i32:
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
         return TCG_TARGET_REG_BITS == 32;
 
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i64:
     case INDEX_op_setcond_i64:
     case INDEX_op_brcond_i64:
     case INDEX_op_ld8u_i64:
@@ -XXX,XX +XXX,XX @@ bool tcg_op_supported(TCGOpcode op)
 
     case INDEX_op_mov_vec:
     case INDEX_op_dup_vec:
-    case INDEX_op_dupi_vec:
     case INDEX_op_dupm_vec:
     case INDEX_op_ld_vec:
     case INDEX_op_st_vec:
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
 }
 
 /*
- * Specialized code generation for INDEX_op_movi_*.
+ * Specialized code generation for INDEX_op_mov_* with a constant.
  */
 static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
                                   tcg_target_ulong val, TCGLifeData arg_life,
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
     }
 }
 
-static void tcg_reg_alloc_movi(TCGContext *s, const TCGOp *op)
-{
-    TCGTemp *ots = arg_temp(op->args[0]);
-    tcg_target_ulong val = op->args[1];
-
-    tcg_reg_alloc_do_movi(s, ots, val, op->life, op->output_pref[0]);
-}
-
 /*
  * Specialized code generation for INDEX_op_mov_*.
  */
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         case INDEX_op_mov_vec:
             tcg_reg_alloc_mov(s, op);
             break;
-        case INDEX_op_movi_i32:
-        case INDEX_op_movi_i64:
-        case INDEX_op_dupi_vec:
-            tcg_reg_alloc_movi(s, op);
-            break;
         case INDEX_op_dup_vec:
             tcg_reg_alloc_dup(s, op);
             break;
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
-    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
     default:
         g_assert_not_reached();
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
-    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
     default:
         g_assert_not_reached();
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
 
     case INDEX_op_mov_i32:   /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32:  /* Always emitted via tcg_out_movi.  */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:      /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         return;
 
     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
-    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
     default:
         g_assert_not_reached();
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         g_assert_not_reached();
diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390/tcg-target.c.inc
+++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
-    case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
-    case INDEX_op_movi_i64:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
-- 
2.25.1

There are several ways we can expand a vector dup of a 64-bit
element on a 32-bit host.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
     }
 }
 
+static void tcg_reg_alloc_dup2(TCGContext *s, const TCGOp *op)
+{
+    const TCGLifeData arg_life = op->life;
+    TCGTemp *ots, *itsl, *itsh;
+    TCGType vtype = TCGOP_VECL(op) + TCG_TYPE_V64;
+
+    /* This opcode is only valid for 32-bit hosts, for 64-bit elements. */
+    tcg_debug_assert(TCG_TARGET_REG_BITS == 32);
+    tcg_debug_assert(TCGOP_VECE(op) == MO_64);
+
+    ots = arg_temp(op->args[0]);
+    itsl = arg_temp(op->args[1]);
+    itsh = arg_temp(op->args[2]);
+
+    /* ENV should not be modified.  */
+    tcg_debug_assert(!temp_readonly(ots));
+
+    /* Allocate the output register now.  */
+    if (ots->val_type != TEMP_VAL_REG) {
+        TCGRegSet allocated_regs = s->reserved_regs;
+        TCGRegSet dup_out_regs =
+            tcg_op_defs[INDEX_op_dup_vec].args_ct[0].regs;
+
+        /* Make sure to not spill the input registers. */
+        if (!IS_DEAD_ARG(1) && itsl->val_type == TEMP_VAL_REG) {
+            tcg_regset_set_reg(allocated_regs, itsl->reg);
+        }
+        if (!IS_DEAD_ARG(2) && itsh->val_type == TEMP_VAL_REG) {
+            tcg_regset_set_reg(allocated_regs, itsh->reg);
+        }
+
+        ots->reg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
+                                 op->output_pref[0], ots->indirect_base);
+        ots->val_type = TEMP_VAL_REG;
+        ots->mem_coherent = 0;
+        s->reg_to_temp[ots->reg] = ots;
+    }
+
+    /* Promote dup2 of immediates to dupi_vec. */
+    if (itsl->val_type == TEMP_VAL_CONST && itsh->val_type == TEMP_VAL_CONST) {
+        uint64_t val = deposit64(itsl->val, 32, 32, itsh->val);
+        MemOp vece = MO_64;
+
+        if (val == dup_const(MO_8, val)) {
+            vece = MO_8;
+        } else if (val == dup_const(MO_16, val)) {
+            vece = MO_16;
+        } else if (val == dup_const(MO_32, val)) {
+            vece = MO_32;
+        }
+
+        tcg_out_dupi_vec(s, vtype, vece, ots->reg, val);
+        goto done;
+    }
+
+    /* If the two inputs form one 64-bit value, try dupm_vec. */
+    if (itsl + 1 == itsh &&
+        itsl->base_type == TCG_TYPE_I64 &&
+        itsh->base_type == TCG_TYPE_I64) {
+        if (!itsl->mem_coherent) {
+            temp_sync(s, itsl, s->reserved_regs, 0, 0);
+        }
+        if (!itsh->mem_coherent) {
+            temp_sync(s, itsh, s->reserved_regs, 0, 0);
+        }
+#ifdef HOST_WORDS_BIGENDIAN
+        TCGTemp *its = itsh;
+#else
+        TCGTemp *its = itsl;
+#endif
+        if (tcg_out_dupm_vec(s, vtype, MO_64, ots->reg,
+                             its->mem_base->reg, its->mem_offset)) {
+            goto done;
+        }
+    }
+
+    /* Fall back to generic expansion. */
+    tcg_reg_alloc_op(s, op);
+    return;
+
+ done:
+    if (IS_DEAD_ARG(1)) {
+        temp_dead(s, itsl);
+    }
+    if (IS_DEAD_ARG(2)) {
+        temp_dead(s, itsh);
+    }
+    if (NEED_SYNC_ARG(0)) {
+        temp_sync(s, ots, s->reserved_regs, 0, IS_DEAD_ARG(0));
+    } else if (IS_DEAD_ARG(0)) {
+        temp_dead(s, ots);
+    }
+}
+
 #ifdef TCG_TARGET_STACK_GROWSUP
 #define STACK_DIR(x) (-(x))
 #else
@@ -XXX,XX +XXX,XX @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         case INDEX_op_dup_vec:
             tcg_reg_alloc_dup(s, op);
             break;
+        case INDEX_op_dup2_vec:
+            tcg_reg_alloc_dup2(s, op);
+            break;
         case INDEX_op_insn_start:
             if (num_insns >= 0) {
                 size_t off = tcg_current_code_size(s);
-- 
2.25.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
 static void expand_vec_mul(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
 {
-    TCGv_vec t1, t2, t3, t4;
+    TCGv_vec t1, t2, t3, t4, zero;
 
     tcg_debug_assert(vece == MO_8);
 
@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece,
     case TCG_TYPE_V64:
         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
-        tcg_gen_dup16i_vec(t2, 0);
+        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
-                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
+                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
-                  tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
+                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
         tcg_gen_mul_vec(MO_16, t1, t1, t2);
         tcg_gen_shri_vec(MO_16, t1, t1, 8);
         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece,
         t2 = tcg_temp_new_vec(type);
         t3 = tcg_temp_new_vec(type);
         t4 = tcg_temp_new_vec(type);
-        tcg_gen_dup16i_vec(t4, 0);
+        zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
-                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
+                  tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
-                  tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
+                  tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
-                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
+                  tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
-                  tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
+                  tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
         tcg_gen_mul_vec(MO_16, t1, t1, t2);
         tcg_gen_mul_vec(MO_16, t3, t3, t4);
         tcg_gen_shri_vec(MO_16, t1, t1, 8);
@@ -XXX,XX +XXX,XX @@ static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
         NEED_UMIN = 8,
         NEED_UMAX = 16,
     };
-    TCGv_vec t1, t2;
+    TCGv_vec t1, t2, t3;
     uint8_t fixup;
 
     switch (cond) {
@@ -XXX,XX +XXX,XX @@ static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
     } else if (fixup & NEED_BIAS) {
         t1 = tcg_temp_new_vec(type);
         t2 = tcg_temp_new_vec(type);
-        tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
-        tcg_gen_sub_vec(vece, t1, v1, t2);
-        tcg_gen_sub_vec(vece, t2, v2, t2);
+        t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
+        tcg_gen_sub_vec(vece, t1, v1, t3);
+        tcg_gen_sub_vec(vece, t2, v2, t3);
         v1 = t1;
         v2 = t2;
         cond = tcg_signed_cond(cond);
-- 
2.25.1

These interfaces have been replaced by tcg_gen_dupi_vec
and tcg_constant_vec.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op.h |  4 ----
 tcg/tcg-op-vec.c     | 20 --------------------
 2 files changed, 24 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index XXXXXXX..XXXXXXX 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -XXX,XX +XXX,XX @@ void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
 void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
 void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
 void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
-void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
-void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
-void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
-void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
 void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
 void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -XXX,XX +XXX,XX @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
     return tcg_const_ones_vec(t->base_type);
 }
 
-void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
-{
-    tcg_gen_dupi_vec(MO_64, r, a);
-}
-
-void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
-{
-    tcg_gen_dupi_vec(MO_32, r, a);
-}
-
-void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
-{
-    tcg_gen_dupi_vec(MO_16, r, a);
-}
-
-void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
-{
-    tcg_gen_dupi_vec(MO_8, r, a);
-}
-
 void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
 {
     TCGTemp *rt = tcgv_vec_temp(r);
-- 
2.25.1

Improve expand_vec_shi to use sign-extraction for MO_32.
This allows a single VSPLTISB instruction to load all of
the valid shift constants.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 44 ++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
 static void expand_vec_shi(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGArg imm, TCGOpcode opci)
 {
-    TCGv_vec t1 = tcg_temp_new_vec(type);
+    TCGv_vec t1;
 
-    /* Splat w/bytes for xxspltib.  */
-    tcg_gen_dupi_vec(MO_8, t1, imm & ((8 << vece) - 1));
+    if (vece == MO_32) {
+        /*
+         * Only 5 bits are significant, and VSPLTISB can represent -16..15.
+         * So using negative numbers gets us the 4th bit easily.
+         */
+        imm = sextract32(imm, 0, 5);
+    } else {
+        imm &= (8 << vece) - 1;
+    }
+
+    /* Splat w/bytes for xxspltib when 2.07 allows MO_64. */
+    t1 = tcg_constant_vec(type, MO_8, imm);
     vec_gen_3(opci, type, vece, tcgv_vec_arg(v0),
               tcgv_vec_arg(v1), tcgv_vec_arg(t1));
-    tcg_temp_free_vec(t1);
 }
 
 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
 {
     TCGv_vec t1 = tcg_temp_new_vec(type);
     TCGv_vec t2 = tcg_temp_new_vec(type);
-    TCGv_vec t3, t4;
+    TCGv_vec c0, c16;
 
     switch (vece) {
     case MO_8:
@@ -XXX,XX +XXX,XX @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
 
     case MO_32:
         tcg_debug_assert(!have_isa_2_07);
-        t3 = tcg_temp_new_vec(type);
-        t4 = tcg_temp_new_vec(type);
-        tcg_gen_dupi_vec(MO_8, t4, -16);
+        /*
+         * Only 5 bits are significant, and VSPLTISB can represent -16..15.
+         * So using -16 is a quick way to represent 16.
+         */
+        c16 = tcg_constant_vec(type, MO_8, -16);
+        c0 = tcg_constant_vec(type, MO_8, 0);
+
         vec_gen_3(INDEX_op_rotlv_vec, type, MO_32, tcgv_vec_arg(t1),
-                  tcgv_vec_arg(v2), tcgv_vec_arg(t4));
+                  tcgv_vec_arg(v2), tcgv_vec_arg(c16));
         vec_gen_3(INDEX_op_ppc_mulou_vec, type, MO_16, tcgv_vec_arg(t2),
                   tcgv_vec_arg(v1), tcgv_vec_arg(v2));
-        tcg_gen_dupi_vec(MO_8, t3, 0);
-        vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t3),
-                  tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
-        vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t3),
-                  tcgv_vec_arg(t3), tcgv_vec_arg(t4));
-        tcg_gen_add_vec(MO_32, v0, t2, t3);
-        tcg_temp_free_vec(t3);
-        tcg_temp_free_vec(t4);
+        vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(c0));
+        vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(t1), tcgv_vec_arg(c16));
+        tcg_gen_add_vec(MO_32, v0, t1, t2);
         break;
 
     default:
-- 
2.25.1

Improve rotrv_vec to reduce "t1 = -v2, t2 = t1 + c" to
"t1 = -v2, t2 = c - v2".  This avoids a serial dependency
between t1 and t2.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.c.inc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                        TCGArg a0, ...)
 {
     va_list va;
-    TCGv_vec v0, v1, v2, t1, t2;
+    TCGv_vec v0, v1, v2, t1, t2, c1;
     TCGArg a2;
 
     va_start(va, a0);
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
 
     case INDEX_op_rotlv_vec:
         t1 = tcg_temp_new_vec(type);
-        tcg_gen_dupi_vec(vece, t1, 8 << vece);
-        tcg_gen_sub_vec(vece, t1, v2, t1);
+        c1 = tcg_constant_vec(type, vece, 8 << vece);
+        tcg_gen_sub_vec(vece, t1, v2, c1);
         /* Right shifts are negative left shifts for AArch64.  */
         vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
                   tcgv_vec_arg(v1), tcgv_vec_arg(t1));
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
     case INDEX_op_rotrv_vec:
         t1 = tcg_temp_new_vec(type);
         t2 = tcg_temp_new_vec(type);
+        c1 = tcg_constant_vec(type, vece, 8 << vece);
         tcg_gen_neg_vec(vece, t1, v2);
-        tcg_gen_dupi_vec(vece, t2, 8 << vece);
-        tcg_gen_add_vec(vece, t2, t1, t2);
+        tcg_gen_sub_vec(vece, t2, c1, v2);
         /* Right shifts are negative left shifts for AArch64.  */
         vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
                   tcgv_vec_arg(v1), tcgv_vec_arg(t1));
-- 
2.25.1

Begin conversion of constraints to pre-validated, read-only entities.
To begin, create a simple method by which sets of TCGTargetOpDef
structures may be declared and used.  This simplifies each host's
tcg_target_op_def function and ensures that we have a collected
set of constraints.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-constr.c.inc | 108 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 tcg/tcg-constr.c.inc

diff --git a/tcg/tcg-constr.c.inc b/tcg/tcg-constr.c.inc
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/tcg-constr.c.inc
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * TCG backend data: operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+/*
+ * Define structures for each set of constraints.
+ */
+
+#define C_PFX1(P, A)                 P##A
+#define C_PFX2(P, A, B)              P##A##_##B
+#define C_PFX3(P, A, B, C)           P##A##_##B##_##C
+#define C_PFX4(P, A, B, C, D)        P##A##_##B##_##C##_##D
+#define C_PFX5(P, A, B, C, D, E)     P##A##_##B##_##C##_##D##_##E
+#define C_PFX6(P, A, B, C, D, E, F)  P##A##_##B##_##C##_##D##_##E##_##F
+
+#define C_O0_I1(I1) \
+    static const TCGTargetOpDef C_PFX1(c_o0_i1_, I1) \
+      = { .args_ct_str = { #I1 } };
+
+#define C_O0_I2(I1, I2) \
+    static const TCGTargetOpDef C_PFX2(c_o0_i2_, I1, I2) \
+      = { .args_ct_str = { #I1, #I2 } };
+
+#define C_O0_I3(I1, I2, I3) \
+    static const TCGTargetOpDef C_PFX3(c_o0_i3_, I1, I2, I3) \
+      = { .args_ct_str = { #I1, #I2, #I3 } };
+
+#define C_O0_I4(I1, I2, I3, I4) \
+    static const TCGTargetOpDef C_PFX4(c_o0_i4_, I1, I2, I3, I4) \
+      = { .args_ct_str = { #I1, #I2, #I3, #I4 } };
+
+#define C_O1_I1(O1, I1) \
+    static const TCGTargetOpDef C_PFX2(c_o1_i1_, O1, I1) \
+      = { .args_ct_str = { #O1, #I1 } };
+
+#define C_O1_I2(O1, I1, I2) \
+    static const TCGTargetOpDef C_PFX3(c_o1_i2_, O1, I1, I2) \
+      = { .args_ct_str = { #O1, #I1, #I2 } };
+
+#define C_O1_I3(O1, I1, I2, I3) \
+    static const TCGTargetOpDef C_PFX4(c_o1_i3_, O1, I1, I2, I3) \
+      = { .args_ct_str = { #O1, #I1, #I2, #I3 } };
+
+#define C_O1_I4(O1, I1, I2, I3, I4) \
+    static const TCGTargetOpDef C_PFX5(c_o1_i4_, O1, I1, I2, I3, I4) \
+      = { .args_ct_str = { #O1, #I1, #I2, #I3, #I4 } };
+
+#define C_N1_I2(O1, I1, I2) \
+    static const TCGTargetOpDef C_PFX3(c_n1_i2_, O1, I1, I2) \
+      = { .args_ct_str = { "&" #O1, #I1, #I2 } };
+
+#define C_O2_I1(O1, O2, I1) \
+    static const TCGTargetOpDef C_PFX3(c_o2_i1_, O1, O2, I1) \
+      = { .args_ct_str = { #O1, #O2, #I1 } };
+
+#define C_O2_I2(O1, O2, I1, I2) \
+    static const TCGTargetOpDef C_PFX4(c_o2_i2_, O1, O2, I1, I2) \
+      = { .args_ct_str = { #O1, #O2, #I1, #I2 } };
+
+#define C_O2_I3(O1, O2, I1, I2, I3) \
+    static const TCGTargetOpDef C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3) \
+      = { .args_ct_str = { #O1, #O2, #I1, #I2, #I3 } };
+
+#define C_O2_I4(O1, O2, I1, I2, I3, I4) \
+    static const TCGTargetOpDef C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4) \
+      = { .args_ct_str = { #O1, #O2, #I1, #I2, #I3, #I4 } };
+
+#include "tcg-target-constr.h"
+
+
+/*
+ * Redefine the macros so that they now reference those structures.
+ * These values should be returned from tcg_target_op_def().
+ */
+
+#undef C_O0_I1
+#undef C_O0_I2
+#undef C_O0_I3
+#undef C_O0_I4
+#undef C_O1_I1
+#undef C_O1_I2
+#undef C_O1_I3
+#undef C_O1_I4
+#undef C_N1_I2
+#undef C_O2_I1
+#undef C_O2_I2
+#undef C_O2_I3
+#undef C_O2_I4
+
+#define C_O0_I1(I1)                     &C_PFX1(c_o0_i1_, I1)
+#define C_O0_I2(I1, I2)                 &C_PFX2(c_o0_i2_, I1, I2)
+#define C_O0_I3(I1, I2, I3)             &C_PFX3(c_o0_i3_, I1, I2, I3)
+#define C_O0_I4(I1, I2, I3, I4)         &C_PFX4(c_o0_i4_, I1, I2, I3, I4)
+
+#define C_O1_I1(O1, I1)                 &C_PFX2(c_o1_i1_, O1, I1)
+#define C_O1_I2(O1, I1, I2)             &C_PFX3(c_o1_i2_, O1, I1, I2)
+#define C_O1_I3(O1, I1, I2, I3)         &C_PFX4(c_o1_i3_, O1, I1, I2, I3)
+#define C_O1_I4(O1, I1, I2, I3, I4)     &C_PFX5(c_o1_i4_, O1, I1, I2, I3, I4)
+
+#define C_N1_I2(O1, I1, I2)             &C_PFX3(c_n1_i2_, O1, I1, I2)
+
+#define C_O2_I1(O1, O2, I1)             &C_PFX3(c_o2_i1_, O1, O2, I1)
+#define C_O2_I2(O1, O2, I1, I2)         &C_PFX4(c_o2_i2_, O1, O2, I1, I2)
+#define C_O2_I3(O1, O2, I1, I2, I3)     &C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3)
+#define C_O2_I4(O1, O2, I1, I2, I3, I4) \
+    &C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4)
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target-constr.h |  55 +++++++++++
 tcg/i386/tcg-target.c.inc    | 187 +++++++++++++----------------------
 2 files changed, 121 insertions(+), 121 deletions(-)
 create mode 100644 tcg/i386/tcg-target-constr.h

diff --git a/tcg/i386/tcg-target-constr.h b/tcg/i386/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/i386/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * i386 target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I1(r)
+
+C_O0_I2(qi, r)
+C_O0_I2(ri, r)
+C_O0_I2(re, r)
+C_O0_I2(r, re)
+C_O0_I2(L, L)
+C_O0_I2(x, r)
+
+C_O0_I3(L, L, L)
+
+C_O0_I4(L, L, L, L)
+C_O0_I4(r, r, ri, ri)
+
+C_O1_I1(r, 0)
+C_O1_I1(r, q)
+C_O1_I1(r, r)
+C_O1_I1(r, L)
+C_O1_I1(x, r)
+C_O1_I1(x, x)
+
+C_O1_I2(r, r, re)
+C_O1_I2(r, 0, r)
+C_O1_I2(r, 0, re)
+C_O1_I2(r, 0, reZ)
+C_O1_I2(r, 0, rI)
+C_O1_I2(r, 0, ri)
+C_O1_I2(r, 0, ci)
+C_O1_I2(r, r, ri)
+C_O1_I2(Q, 0, Q)
+C_O1_I2(q, r, re)
+C_O1_I2(r, L, L)
+C_O1_I2(x, x, x)
+C_N1_I2(r, r, r)
+C_N1_I2(r, r, rW)
+
+C_O1_I3(x, x, x, x)
+
+C_O1_I4(r, r, re, r, 0)
+C_O1_I4(r, r, r, ri, ri)
+
+C_O2_I1(r, r, L)
+
+C_O2_I2(r, r, L, L)
+C_O2_I2(a, d, a, r)
+
+C_O2_I3(a, d, 0, 1, r)
+
+C_O2_I4(r, r, 0, 1, re, re)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
-    static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
-    static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
-    static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
-    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
-    static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
-    static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
-    static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
-    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
-    static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
-    static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
-    static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
-    static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
-    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
-    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
-    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
-    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
-    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
-    static const TCGTargetOpDef r_r_L_L
-        = { .args_ct_str = { "r", "r", "L", "L" } };
-    static const TCGTargetOpDef L_L_L_L
-        = { .args_ct_str = { "L", "L", "L", "L" } };
-    static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
-    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
-    static const TCGTargetOpDef x_x_x_x
-        = { .args_ct_str = { "x", "x", "x", "x" } };
-    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
-
     switch (op) {
     case INDEX_op_goto_ptr:
-        return &r;
+        return C_O0_I1(r);
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8u_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ld32u_i64:
     case INDEX_op_ld32s_i64:
     case INDEX_op_ld_i64:
-        return &r_r;
+        return C_O1_I1(r, r);
 
     case INDEX_op_st8_i32:
     case INDEX_op_st8_i64:
-        return &qi_r;
+        return C_O0_I2(qi, r);
+
     case INDEX_op_st16_i32:
     case INDEX_op_st16_i64:
     case INDEX_op_st_i32:
     case INDEX_op_st32_i64:
-        return &ri_r;
+        return C_O0_I2(ri, r);
+
     case INDEX_op_st_i64:
-        return &re_r;
+        return C_O0_I2(re, r);
 
     case INDEX_op_add_i32:
     case INDEX_op_add_i64:
-        return &r_r_re;
+        return C_O1_I2(r, r, re);
+
     case INDEX_op_sub_i32:
     case INDEX_op_sub_i64:
     case INDEX_op_mul_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_or_i64:
     case INDEX_op_xor_i32:
     case INDEX_op_xor_i64:
-        return &r_0_re;
+        return C_O1_I2(r, 0, re);
 
     case INDEX_op_and_i32:
     case INDEX_op_and_i64:
-        {
-            static const TCGTargetOpDef and
-                = { .args_ct_str = { "r", "0", "reZ" } };
-            return &and;
-        }
-        break;
+        return C_O1_I2(r, 0, reZ);
+
     case INDEX_op_andc_i32:
     case INDEX_op_andc_i64:
-        {
-            static const TCGTargetOpDef andc
-                = { .args_ct_str = { "r", "r", "rI" } };
-            return &andc;
-        }
-        break;
+        return C_O1_I2(r, 0, rI);
 
     case INDEX_op_shl_i32:
     case INDEX_op_shl_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shr_i64:
     case INDEX_op_sar_i32:
     case INDEX_op_sar_i64:
-        return have_bmi2 ? &r_r_ri : &r_0_ci;
+        return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
+
     case INDEX_op_rotl_i32:
     case INDEX_op_rotl_i64:
     case INDEX_op_rotr_i32:
     case INDEX_op_rotr_i64:
-        return &r_0_ci;
+        return C_O1_I2(r, 0, ci);
 
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
-        return &r_re;
+        return C_O0_I2(r, re);
 
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_not_i32:
     case INDEX_op_not_i64:
     case INDEX_op_extrh_i64_i32:
-        return &r_0;
+        return C_O1_I1(r, 0);
 
     case INDEX_op_ext8s_i32:
     case INDEX_op_ext8s_i64:
     case INDEX_op_ext8u_i32:
     case INDEX_op_ext8u_i64:
-        return &r_q;
+        return C_O1_I1(r, q);
+
     case INDEX_op_ext16s_i32:
     case INDEX_op_ext16s_i64:
     case INDEX_op_ext16u_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_sextract_i32:
     case INDEX_op_ctpop_i32:
     case INDEX_op_ctpop_i64:
-        return &r_r;
+        return C_O1_I1(r, r);
+
     case INDEX_op_extract2_i32:
     case INDEX_op_extract2_i64:
-        return &r_0_r;
+        return C_O1_I2(r, 0, r);
 
     case INDEX_op_deposit_i32:
     case INDEX_op_deposit_i64:
-        {
-            static const TCGTargetOpDef dep
-                = { .args_ct_str = { "Q", "0", "Q" } };
-            return &dep;
-        }
+        return C_O1_I2(Q, 0, Q);
+
     case INDEX_op_setcond_i32:
     case INDEX_op_setcond_i64:
-        {
-            static const TCGTargetOpDef setc
-                = { .args_ct_str = { "q", "r", "re" } };
-            return &setc;
-        }
+        return C_O1_I2(q, r, re);
+
     case INDEX_op_movcond_i32:
     case INDEX_op_movcond_i64:
-        {
-            static const TCGTargetOpDef movc
-                = { .args_ct_str = { "r", "r", "re", "r", "0" } };
-            return &movc;
-        }
+        return C_O1_I4(r, r, re, r, 0);
+
     case INDEX_op_div2_i32:
     case INDEX_op_div2_i64:
     case INDEX_op_divu2_i32:
     case INDEX_op_divu2_i64:
-        {
-            static const TCGTargetOpDef div2
-                = { .args_ct_str = { "a", "d", "0", "1", "r" } };
-            return &div2;
-        }
+        return C_O2_I3(a, d, 0, 1, r);
+
     case INDEX_op_mulu2_i32:
     case INDEX_op_mulu2_i64:
     case INDEX_op_muls2_i32:
     case INDEX_op_muls2_i64:
-        {
-            static const TCGTargetOpDef mul2
-                = { .args_ct_str = { "a", "d", "a", "r" } };
-            return &mul2;
-        }
+        return C_O2_I2(a, d, a, r);
+
     case INDEX_op_add2_i32:
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i32:
     case INDEX_op_sub2_i64:
-        {
-            static const TCGTargetOpDef arith2
-                = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
-            return &arith2;
-        }
+        return C_O2_I4(r, r, 0, 1, re, re);
+
     case INDEX_op_ctz_i32:
     case INDEX_op_ctz_i64:
-        {
-            static const TCGTargetOpDef ctz[2] = {
-                { .args_ct_str = { "&r", "r", "r" } },
-                { .args_ct_str = { "&r", "r", "rW" } },
-            };
-            return &ctz[have_bmi1];
-        }
+        return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
+
     case INDEX_op_clz_i32:
     case INDEX_op_clz_i64:
-        {
-            static const TCGTargetOpDef clz[2] = {
-                { .args_ct_str = { "&r", "r", "r" } },
-                { .args_ct_str = { "&r", "r", "rW" } },
-            };
-            return &clz[have_lzcnt];
-        }
+        return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
 
     case INDEX_op_qemu_ld_i32:
-        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
+        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
+
     case INDEX_op_qemu_st_i32:
-        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
+        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+                ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
+
     case INDEX_op_qemu_ld_i64:
-        return (TCG_TARGET_REG_BITS == 64 ? &r_L
-                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
-                : &r_r_L_L);
+        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
+                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
+                : C_O2_I2(r, r, L, L));
+
     case INDEX_op_qemu_st_i64:
-        return (TCG_TARGET_REG_BITS == 64 ? &L_L
-                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
-                : &L_L_L_L);
+        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
+                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
+                : C_O0_I4(L, L, L, L));
 
     case INDEX_op_brcond2_i32:
-        {
-            static const TCGTargetOpDef b2
-                = { .args_ct_str = { "r", "r", "ri", "ri" } };
-            return &b2;
-        }
+        return C_O0_I4(r, r, ri, ri);
+
     case INDEX_op_setcond2_i32:
-        {
-            static const TCGTargetOpDef s2
-                = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
-            return &s2;
-        }
+        return C_O1_I4(r, r, r, ri, ri);
 
     case INDEX_op_ld_vec:
-    case INDEX_op_st_vec:
     case INDEX_op_dupm_vec:
-        return &x_r;
+        return C_O1_I1(x, r);
+
+    case INDEX_op_st_vec:
+        return C_O0_I2(x, r);
 
     case INDEX_op_add_vec:
     case INDEX_op_sub_vec:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 #if TCG_TARGET_REG_BITS == 32
     case INDEX_op_dup2_vec:
 #endif
-        return &x_x_x;
+        return C_O1_I2(x, x, x);
+
     case INDEX_op_abs_vec:
     case INDEX_op_dup_vec:
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
     case INDEX_op_x86_psrldq_vec:
-        return &x_x;
+        return C_O1_I1(x, x);
+
     case INDEX_op_x86_vpblendvb_vec:
-        return &x_x_x_x;
+        return C_O1_I3(x, x, x, x);
 
     default:
         break;
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target-constr.h | 31 ++++++++++++
 tcg/aarch64/tcg-target.c.inc    | 85 +++++++++++----------------------
 2 files changed, 60 insertions(+), 56 deletions(-)
 create mode 100644 tcg/aarch64/tcg-target-constr.h

diff --git a/tcg/aarch64/tcg-target-constr.h b/tcg/aarch64/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/aarch64/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AArch64 target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I1(r)
+C_O0_I2(lZ, l)
+C_O0_I2(r, rA)
+C_O0_I2(rZ, r)
+C_O0_I2(w, r)
+C_O1_I1(r, l)
+C_O1_I1(r, r)
+C_O1_I1(w, r)
+C_O1_I1(w, w)
+C_O1_I1(w, wr)
+C_O1_I2(r, 0, rZ)
+C_O1_I2(r, r, r)
+C_O1_I2(r, r, rA)
+C_O1_I2(r, r, rAL)
+C_O1_I2(r, r, ri)
+C_O1_I2(r, r, rL)
+C_O1_I2(r, rZ, rZ)
+C_O1_I2(w, 0, w)
+C_O1_I2(w, w, w)
+C_O1_I2(w, w, wN)
+C_O1_I2(w, w, wO)
+C_O1_I2(w, w, wZ)
+C_O1_I3(w, w, w, w)
+C_O1_I4(r, r, rA, rZ, rZ)
+C_O2_I4(r, r, rZ, rZ, rA, rMZ)
diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
     va_end(va);
 }
 
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
-    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
-    static const TCGTargetOpDef w_w = { .args_ct_str = { "w", "w" } };
-    static const TCGTargetOpDef w_r = { .args_ct_str = { "w", "r" } };
-    static const TCGTargetOpDef w_wr = { .args_ct_str = { "w", "wr" } };
-    static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
-    static const TCGTargetOpDef r_rA = { .args_ct_str = { "r", "rA" } };
-    static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
-    static const TCGTargetOpDef lZ_l = { .args_ct_str = { "lZ", "l" } };
-    static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
-    static const TCGTargetOpDef w_w_w = { .args_ct_str = { "w", "w", "w" } };
-    static const TCGTargetOpDef w_0_w = { .args_ct_str = { "w", "0", "w" } };
-    static const TCGTargetOpDef w_w_wO = { .args_ct_str = { "w", "w", "wO" } };
-    static const TCGTargetOpDef w_w_wN = { .args_ct_str = { "w", "w", "wN" } };
-    static const TCGTargetOpDef w_w_wZ = { .args_ct_str = { "w", "w", "wZ" } };
-    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
-    static const TCGTargetOpDef r_r_rA = { .args_ct_str = { "r", "r", "rA" } };
-    static const TCGTargetOpDef r_r_rL = { .args_ct_str = { "r", "r", "rL" } };
-    static const TCGTargetOpDef r_r_rAL
-        = { .args_ct_str = { "r", "r", "rAL" } };
-    static const TCGTargetOpDef dep
-        = { .args_ct_str = { "r", "0", "rZ" } };
-    static const TCGTargetOpDef ext2
-        = { .args_ct_str = { "r", "rZ", "rZ" } };
-    static const TCGTargetOpDef movc
-        = { .args_ct_str = { "r", "r", "rA", "rZ", "rZ" } };
-    static const TCGTargetOpDef add2
-        = { .args_ct_str = { "r", "r", "rZ", "rZ", "rA", "rMZ" } };
-    static const TCGTargetOpDef w_w_w_w
-        = { .args_ct_str = { "w", "w", "w", "w" } };
-
     switch (op) {
     case INDEX_op_goto_ptr:
-        return &r;
+        return C_O0_I1(r);
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8s_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_extract_i64:
     case INDEX_op_sextract_i32:
     case INDEX_op_sextract_i64:
-        return &r_r;
+        return C_O1_I1(r, r);
 
     case INDEX_op_st8_i32:
     case INDEX_op_st16_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_st16_i64:
     case INDEX_op_st32_i64:
     case INDEX_op_st_i64:
-        return &rZ_r;
+        return C_O0_I2(rZ, r);
 
     case INDEX_op_add_i32:
     case INDEX_op_add_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_sub_i64:
     case INDEX_op_setcond_i32:
     case INDEX_op_setcond_i64:
-        return &r_r_rA;
+        return C_O1_I2(r, r, rA);
 
     case INDEX_op_mul_i32:
     case INDEX_op_mul_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_remu_i64:
     case INDEX_op_muluh_i64:
     case INDEX_op_mulsh_i64:
-        return &r_r_r;
+        return C_O1_I2(r, r, r);
 
     case INDEX_op_and_i32:
     case INDEX_op_and_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_orc_i64:
     case INDEX_op_eqv_i32:
     case INDEX_op_eqv_i64:
-        return &r_r_rL;
+        return C_O1_I2(r, r, rL);
 
     case INDEX_op_shl_i32:
     case INDEX_op_shr_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_sar_i64:
     case INDEX_op_rotl_i64:
     case INDEX_op_rotr_i64:
-        return &r_r_ri;
+        return C_O1_I2(r, r, ri);
 
     case INDEX_op_clz_i32:
     case INDEX_op_ctz_i32:
     case INDEX_op_clz_i64:
     case INDEX_op_ctz_i64:
-        return &r_r_rAL;
+        return C_O1_I2(r, r, rAL);
 
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
-        return &r_rA;
+        return C_O0_I2(r, rA);
 
     case INDEX_op_movcond_i32:
     case INDEX_op_movcond_i64:
-        return &movc;
+        return C_O1_I4(r, r, rA, rZ, rZ);
 
     case INDEX_op_qemu_ld_i32:
     case INDEX_op_qemu_ld_i64:
-        return &r_l;
+        return C_O1_I1(r, l);
     case INDEX_op_qemu_st_i32:
     case INDEX_op_qemu_st_i64:
-        return &lZ_l;
+        return C_O0_I2(lZ, l);
 
     case INDEX_op_deposit_i32:
     case INDEX_op_deposit_i64:
-        return &dep;
+        return C_O1_I2(r, 0, rZ);
 
     case INDEX_op_extract2_i32:
     case INDEX_op_extract2_i64:
-        return &ext2;
+        return C_O1_I2(r, rZ, rZ);
 
     case INDEX_op_add2_i32:
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i32:
     case INDEX_op_sub2_i64:
-        return &add2;
+        return C_O2_I4(r, r, rZ, rZ, rA, rMZ);
 
     case INDEX_op_add_vec:
     case INDEX_op_sub_vec:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shrv_vec:
     case INDEX_op_sarv_vec:
     case INDEX_op_aa64_sshl_vec:
-        return &w_w_w;
+        return C_O1_I2(w, w, w);
     case INDEX_op_not_vec:
     case INDEX_op_neg_vec:
     case INDEX_op_abs_vec:
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
-        return &w_w;
+        return C_O1_I1(w, w);
     case INDEX_op_ld_vec:
-    case INDEX_op_st_vec:
     case INDEX_op_dupm_vec:
-        return &w_r;
+        return C_O1_I1(w, r);
+    case INDEX_op_st_vec:
+        return C_O0_I2(w, r);
     case INDEX_op_dup_vec:
-        return &w_wr;
+        return C_O1_I1(w, wr);
     case INDEX_op_or_vec:
     case INDEX_op_andc_vec:
-        return &w_w_wO;
+        return C_O1_I2(w, w, wO);
     case INDEX_op_and_vec:
     case INDEX_op_orc_vec:
-        return &w_w_wN;
+        return C_O1_I2(w, w, wN);
     case INDEX_op_cmp_vec:
-        return &w_w_wZ;
+        return C_O1_I2(w, w, wZ);
     case INDEX_op_bitsel_vec:
-        return &w_w_w_w;
+        return C_O1_I3(w, w, w, w);
     case INDEX_op_aa64_sli_vec:
-        return &w_0_w;
+        return C_O1_I2(w, 0, w);
 
     default:
         return NULL;
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/arm/tcg-target-constr.h | 30 ++++++++++++
 tcg/arm/tcg-target.c.inc    | 93 +++++++++++++------------------------
 2 files changed, 63 insertions(+), 60 deletions(-)
 create mode 100644 tcg/arm/tcg-target-constr.h

diff --git a/tcg/arm/tcg-target-constr.h b/tcg/arm/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/arm/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARM32 target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I1(r)
+C_O0_I2(r, r)
+C_O0_I2(r, rIN)
+C_O0_I2(s, s)
+C_O0_I3(s, s, s)
+C_O0_I4(r, r, rI, rI)
+C_O0_I4(s, s, s, s)
+C_O1_I1(r, l)
+C_O1_I1(r, r)
+C_O1_I2(r, 0, rZ)
+C_O1_I2(r, l, l)
+C_O1_I2(r, r, r)
+C_O1_I2(r, r, rI)
+C_O1_I2(r, r, rIK)
+C_O1_I2(r, r, rIN)
+C_O1_I2(r, r, ri)
+C_O1_I2(r, rZ, rZ)
+C_O1_I4(r, r, r, rI, rI)
+C_O1_I4(r, r, rIN, rIK, 0)
+C_O2_I1(r, r, l)
+C_O2_I2(r, r, l, l)
+C_O2_I2(r, r, r, r)
+C_O2_I4(r, r, r, r, rIN, rIK)
+C_O2_I4(r, r, rI, rI, rIN, rIK)
diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/arm/tcg-target.c.inc
+++ b/tcg/arm/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
-    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
-    static const TCGTargetOpDef s_s = { .args_ct_str = { "s", "s" } };
-    static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
-    static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
-    static const TCGTargetOpDef r_r_l = { .args_ct_str = { "r", "r", "l" } };
-    static const TCGTargetOpDef r_l_l = { .args_ct_str = { "r", "l", "l" } };
-    static const TCGTargetOpDef s_s_s = { .args_ct_str = { "s", "s", "s" } };
-    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
-    static const TCGTargetOpDef r_r_rI = { .args_ct_str = { "r", "r", "rI" } };
-    static const TCGTargetOpDef r_r_rIN
-        = { .args_ct_str = { "r", "r", "rIN" } };
-    static const TCGTargetOpDef r_r_rIK
-        = { .args_ct_str = { "r", "r", "rIK" } };
-    static const TCGTargetOpDef r_r_r_r
-        = { .args_ct_str = { "r", "r", "r", "r" } };
-    static const TCGTargetOpDef r_r_l_l
-        = { .args_ct_str = { "r", "r", "l", "l" } };
-    static const TCGTargetOpDef s_s_s_s
-        = { .args_ct_str = { "s", "s", "s", "s" } };
-    static const TCGTargetOpDef br
-        = { .args_ct_str = { "r", "rIN" } };
-    static const TCGTargetOpDef ext2
-        = { .args_ct_str = { "r", "rZ", "rZ" } };
-    static const TCGTargetOpDef dep
-        = { .args_ct_str = { "r", "0", "rZ" } };
-    static const TCGTargetOpDef movc
-        = { .args_ct_str = { "r", "r", "rIN", "rIK", "0" } };
-    static const TCGTargetOpDef add2
-        = { .args_ct_str = { "r", "r", "r", "r", "rIN", "rIK" } };
-    static const TCGTargetOpDef sub2
-        = { .args_ct_str = { "r", "r", "rI", "rI", "rIN", "rIK" } };
-    static const TCGTargetOpDef br2
-        = { .args_ct_str = { "r", "r", "rI", "rI" } };
-    static const TCGTargetOpDef setc2
-        = { .args_ct_str = { "r", "r", "r", "rI", "rI" } };
-
     switch (op) {
     case INDEX_op_goto_ptr:
-        return &r;
+        return C_O0_I1(r);
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8s_i32:
     case INDEX_op_ld16u_i32:
     case INDEX_op_ld16s_i32:
     case INDEX_op_ld_i32:
-    case INDEX_op_st8_i32:
-    case INDEX_op_st16_i32:
-    case INDEX_op_st_i32:
     case INDEX_op_neg_i32:
     case INDEX_op_not_i32:
     case INDEX_op_bswap16_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ext16u_i32:
     case INDEX_op_extract_i32:
     case INDEX_op_sextract_i32:
-        return &r_r;
+        return C_O1_I1(r, r);
+
+    case INDEX_op_st8_i32:
+    case INDEX_op_st16_i32:
+    case INDEX_op_st_i32:
+        return C_O0_I2(r, r);
 
     case INDEX_op_add_i32:
     case INDEX_op_sub_i32:
     case INDEX_op_setcond_i32:
-        return &r_r_rIN;
+        return C_O1_I2(r, r, rIN);
+
     case INDEX_op_and_i32:
     case INDEX_op_andc_i32:
     case INDEX_op_clz_i32:
     case INDEX_op_ctz_i32:
-        return &r_r_rIK;
+        return C_O1_I2(r, r, rIK);
+
     case INDEX_op_mul_i32:
     case INDEX_op_div_i32:
     case INDEX_op_divu_i32:
-        return &r_r_r;
+        return C_O1_I2(r, r, r);
+
     case INDEX_op_mulu2_i32:
     case INDEX_op_muls2_i32:
-        return &r_r_r_r;
+        return C_O2_I2(r, r, r, r);
+
     case INDEX_op_or_i32:
     case INDEX_op_xor_i32:
-        return &r_r_rI;
+        return C_O1_I2(r, r, rI);
+
     case INDEX_op_shl_i32:
     case INDEX_op_shr_i32:
     case INDEX_op_sar_i32:
     case INDEX_op_rotl_i32:
     case INDEX_op_rotr_i32:
-        return &r_r_ri;
+        return C_O1_I2(r, r, ri);
 
     case INDEX_op_brcond_i32:
-        return &br;
+        return C_O0_I2(r, rIN);
     case INDEX_op_deposit_i32:
-        return &dep;
+        return C_O1_I2(r, 0, rZ);
     case INDEX_op_extract2_i32:
-        return &ext2;
+        return C_O1_I2(r, rZ, rZ);
     case INDEX_op_movcond_i32:
-        return &movc;
+        return C_O1_I4(r, r, rIN, rIK, 0);
     case INDEX_op_add2_i32:
-        return &add2;
+        return C_O2_I4(r, r, r, r, rIN, rIK);
     case INDEX_op_sub2_i32:
-        return &sub2;
+        return C_O2_I4(r, r, rI, rI, rIN, rIK);
     case INDEX_op_brcond2_i32:
-        return &br2;
+        return C_O0_I4(r, r, rI, rI);
     case INDEX_op_setcond2_i32:
-        return &setc2;
+        return C_O1_I4(r, r, r, rI, rI);
 
     case INDEX_op_qemu_ld_i32:
-        return TARGET_LONG_BITS == 32 ? &r_l : &r_l_l;
+        return TARGET_LONG_BITS == 32 ? C_O1_I1(r, l) : C_O1_I2(r, l, l);
     case INDEX_op_qemu_ld_i64:
-        return TARGET_LONG_BITS == 32 ? &r_r_l : &r_r_l_l;
+        return TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, l) : C_O2_I2(r, r, l, l);
     case INDEX_op_qemu_st_i32:
-        return TARGET_LONG_BITS == 32 ? &s_s : &s_s_s;
+        return TARGET_LONG_BITS == 32 ? C_O0_I2(s, s) : C_O0_I3(s, s, s);
     case INDEX_op_qemu_st_i64:
-        return TARGET_LONG_BITS == 32 ? &s_s_s : &s_s_s_s;
+        return TARGET_LONG_BITS == 32 ? C_O0_I3(s, s, s) : C_O0_I4(s, s, s, s);
 
     default:
         return NULL;
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/mips/tcg-target-constr.h | 31 ++++++++++++
 tcg/mips/tcg-target.c.inc    | 95 ++++++++++++------------------------
 2 files changed, 61 insertions(+), 65 deletions(-)
 create mode 100644 tcg/mips/tcg-target-constr.h

diff --git a/tcg/mips/tcg-target-constr.h b/tcg/mips/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/mips/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * MIPS target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I1(r)
+C_O0_I2(rZ, r)
+C_O0_I2(rZ, rZ)
+C_O0_I2(SZ, S)
+C_O0_I3(SZ, S, S)
+C_O0_I3(SZ, SZ, S)
+C_O0_I4(rZ, rZ, rZ, rZ)
+C_O0_I4(SZ, SZ, S, S)
+C_O1_I1(r, L)
+C_O1_I1(r, r)
+C_O1_I2(r, 0, rZ)
+C_O1_I2(r, L, L)
+C_O1_I2(r, r, ri)
+C_O1_I2(r, r, rI)
+C_O1_I2(r, r, rIK)
+C_O1_I2(r, r, rJ)
+C_O1_I2(r, r, rWZ)
+C_O1_I2(r, rZ, rN)
+C_O1_I2(r, rZ, rZ)
+C_O1_I4(r, rZ, rZ, rZ, 0)
+C_O1_I4(r, rZ, rZ, rZ, rZ)
+C_O2_I1(r, r, L)
+C_O2_I2(r, r, L, L)
+C_O2_I2(r, r, r, r)
+C_O2_I4(r, r, rZ, rZ, rN, rN)
diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/mips/tcg-target.c.inc
+++ b/tcg/mips/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
-    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
-    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
-    static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
-    static const TCGTargetOpDef SZ_S = { .args_ct_str = { "SZ", "S" } };
-    static const TCGTargetOpDef rZ_rZ = { .args_ct_str = { "rZ", "rZ" } };
-    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
-    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
-    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
-    static const TCGTargetOpDef r_r_rI = { .args_ct_str = { "r", "r", "rI" } };
-    static const TCGTargetOpDef r_r_rJ = { .args_ct_str = { "r", "r", "rJ" } };
-    static const TCGTargetOpDef SZ_S_S = { .args_ct_str = { "SZ", "S", "S" } };
-    static const TCGTargetOpDef SZ_SZ_S
-        = { .args_ct_str = { "SZ", "SZ", "S" } };
-    static const TCGTargetOpDef SZ_SZ_S_S
-        = { .args_ct_str = { "SZ", "SZ", "S", "S" } };
-    static const TCGTargetOpDef r_rZ_rN
-        = { .args_ct_str = { "r", "rZ", "rN" } };
-    static const TCGTargetOpDef r_rZ_rZ
-        = { .args_ct_str = { "r", "rZ", "rZ" } };
-    static const TCGTargetOpDef r_r_rIK
-        = { .args_ct_str = { "r", "r", "rIK" } };
-    static const TCGTargetOpDef r_r_rWZ
-        = { .args_ct_str = { "r", "r", "rWZ" } };
-    static const TCGTargetOpDef r_r_r_r
-        = { .args_ct_str = { "r", "r", "r", "r" } };
-    static const TCGTargetOpDef r_r_L_L
-        = { .args_ct_str = { "r", "r", "L", "L" } };
-    static const TCGTargetOpDef dep
-        = { .args_ct_str = { "r", "0", "rZ" } };
-    static const TCGTargetOpDef movc
-        = { .args_ct_str = { "r", "rZ", "rZ", "rZ", "0" } };
-    static const TCGTargetOpDef movc_r6
-        = { .args_ct_str = { "r", "rZ", "rZ", "rZ", "rZ" } };
-    static const TCGTargetOpDef add2
-        = { .args_ct_str = { "r", "r", "rZ", "rZ", "rN", "rN" } };
-    static const TCGTargetOpDef br2
-        = { .args_ct_str = { "rZ", "rZ", "rZ", "rZ" } };
-    static const TCGTargetOpDef setc2
-        = { .args_ct_str = { "r", "rZ", "rZ", "rZ", "rZ" } };
-
     switch (op) {
     case INDEX_op_goto_ptr:
-        return &r;
+        return C_O0_I1(r);
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8s_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_extrl_i64_i32:
     case INDEX_op_extrh_i64_i32:
     case INDEX_op_extract_i64:
-        return &r_r;
+        return C_O1_I1(r, r);
 
     case INDEX_op_st8_i32:
     case INDEX_op_st16_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_st16_i64:
     case INDEX_op_st32_i64:
     case INDEX_op_st_i64:
-        return &rZ_r;
+        return C_O0_I2(rZ, r);
 
     case INDEX_op_add_i32:
     case INDEX_op_add_i64:
-        return &r_r_rJ;
+        return C_O1_I2(r, r, rJ);
     case INDEX_op_sub_i32:
     case INDEX_op_sub_i64:
-        return &r_rZ_rN;
+        return C_O1_I2(r, rZ, rN);
     case INDEX_op_mul_i32:
     case INDEX_op_mulsh_i32:
     case INDEX_op_muluh_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_remu_i64:
     case INDEX_op_nor_i64:
     case INDEX_op_setcond_i64:
-        return &r_rZ_rZ;
+        return C_O1_I2(r, rZ, rZ);
     case INDEX_op_muls2_i32:
     case INDEX_op_mulu2_i32:
     case INDEX_op_muls2_i64:
     case INDEX_op_mulu2_i64:
-        return &r_r_r_r;
+        return C_O2_I2(r, r, r, r);
     case INDEX_op_and_i32:
     case INDEX_op_and_i64:
-        return &r_r_rIK;
+        return C_O1_I2(r, r, rIK);
     case INDEX_op_or_i32:
     case INDEX_op_xor_i32:
     case INDEX_op_or_i64:
     case INDEX_op_xor_i64:
-        return &r_r_rI;
+        return C_O1_I2(r, r, rI);
     case INDEX_op_shl_i32:
     case INDEX_op_shr_i32:
     case INDEX_op_sar_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_sar_i64:
     case INDEX_op_rotr_i64:
     case INDEX_op_rotl_i64:
-        return &r_r_ri;
+        return C_O1_I2(r, r, ri);
     case INDEX_op_clz_i32:
     case INDEX_op_clz_i64:
-        return &r_r_rWZ;
+        return C_O1_I2(r, r, rWZ);
 
     case INDEX_op_deposit_i32:
     case INDEX_op_deposit_i64:
-        return &dep;
+        return C_O1_I2(r, 0, rZ);
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
-        return &rZ_rZ;
+        return C_O0_I2(rZ, rZ);
     case INDEX_op_movcond_i32:
     case INDEX_op_movcond_i64:
-        return use_mips32r6_instructions ? &movc_r6 : &movc;
-
+        return (use_mips32r6_instructions
+                ? C_O1_I4(r, rZ, rZ, rZ, rZ)
+                : C_O1_I4(r, rZ, rZ, rZ, 0));
     case INDEX_op_add2_i32:
     case INDEX_op_sub2_i32:
-        return &add2;
+        return C_O2_I4(r, r, rZ, rZ, rN, rN);
     case INDEX_op_setcond2_i32:
-        return &setc2;
+        return C_O1_I4(r, rZ, rZ, rZ, rZ);
     case INDEX_op_brcond2_i32:
-        return &br2;
+        return C_O0_I4(rZ, rZ, rZ, rZ);
 
     case INDEX_op_qemu_ld_i32:
         return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
-                ? &r_L : &r_L_L);
+                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
     case INDEX_op_qemu_st_i32:
         return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
-                ? &SZ_S : &SZ_S_S);
+                ? C_O0_I2(SZ, S) : C_O0_I3(SZ, S, S));
     case INDEX_op_qemu_ld_i64:
-        return (TCG_TARGET_REG_BITS == 64 ? &r_L
-                : TARGET_LONG_BITS == 32 ? &r_r_L : &r_r_L_L);
+        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
+                : TARGET_LONG_BITS == 32 ? C_O2_I1(r, r, L)
+                : C_O2_I2(r, r, L, L));
     case INDEX_op_qemu_st_i64:
-        return (TCG_TARGET_REG_BITS == 64 ? &SZ_S
-                : TARGET_LONG_BITS == 32 ? &SZ_SZ_S : &SZ_SZ_S_S);
+        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(SZ, S)
+                : TARGET_LONG_BITS == 32 ? C_O0_I3(SZ, SZ, S)
+                : C_O0_I4(SZ, SZ, S, S));
 
     default:
         return NULL;
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target-constr.h |  37 ++++++++++
 tcg/ppc/tcg-target.c.inc    | 135 +++++++++++++++---------------------
 2 files changed, 94 insertions(+), 78 deletions(-)
 create mode 100644 tcg/ppc/tcg-target-constr.h

diff --git a/tcg/ppc/tcg-target-constr.h b/tcg/ppc/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/ppc/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * PowerPC target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I1(r)
+C_O0_I2(r, r)
+C_O0_I2(r, ri)
+C_O0_I2(S, S)
+C_O0_I2(v, r)
+C_O0_I3(S, S, S)
+C_O0_I4(r, r, ri, ri)
+C_O0_I4(S, S, S, S)
+C_O1_I1(r, L)
+C_O1_I1(r, r)
+C_O1_I1(v, r)
+C_O1_I1(v, v)
+C_O1_I1(v, vr)
+C_O1_I2(r, 0, rZ)
+C_O1_I2(r, L, L)
+C_O1_I2(r, rI, ri)
+C_O1_I2(r, rI, rT)
+C_O1_I2(r, r, r)
+C_O1_I2(r, r, ri)
+C_O1_I2(r, r, rI)
+C_O1_I2(r, r, rT)
+C_O1_I2(r, r, rU)
+C_O1_I2(r, r, rZW)
+C_O1_I2(v, v, v)
+C_O1_I3(v, v, v, v)
+C_O1_I4(r, r, ri, rZ, rZ)
+C_O1_I4(r, r, r, ri, ri)
+C_O2_I1(L, L, L)
+C_O2_I2(L, L, L, L)
+C_O2_I4(r, r, rI, rZM, r, r)
+C_O2_I4(r, r, r, r, rI, rZM)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
     va_end(va);
 }
 
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
-    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
-    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
-    static const TCGTargetOpDef S_S = { .args_ct_str = { "S", "S" } };
-    static const TCGTargetOpDef r_ri = { .args_ct_str = { "r", "ri" } };
-    static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
-    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
-    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
-    static const TCGTargetOpDef S_S_S = { .args_ct_str = { "S", "S", "S" } };
-    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
-    static const TCGTargetOpDef r_r_rI = { .args_ct_str = { "r", "r", "rI" } };
-    static const TCGTargetOpDef r_r_rT = { .args_ct_str = { "r", "r", "rT" } };
-    static const TCGTargetOpDef r_r_rU = { .args_ct_str = { "r", "r", "rU" } };
-    static const TCGTargetOpDef r_rI_ri
-        = { .args_ct_str = { "r", "rI", "ri" } };
-    static const TCGTargetOpDef r_rI_rT
-        = { .args_ct_str = { "r", "rI", "rT" } };
-    static const TCGTargetOpDef r_r_rZW
-        = { .args_ct_str = { "r", "r", "rZW" } };
-    static const TCGTargetOpDef L_L_L_L
-        = { .args_ct_str = { "L", "L", "L", "L" } };
-    static const TCGTargetOpDef S_S_S_S
-        = { .args_ct_str = { "S", "S", "S", "S" } };
-    static const TCGTargetOpDef movc
-        = { .args_ct_str = { "r", "r", "ri", "rZ", "rZ" } };
-    static const TCGTargetOpDef dep
-        = { .args_ct_str = { "r", "0", "rZ" } };
-    static const TCGTargetOpDef br2
-        = { .args_ct_str = { "r", "r", "ri", "ri" } };
-    static const TCGTargetOpDef setc2
-        = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
-    static const TCGTargetOpDef add2
-        = { .args_ct_str = { "r", "r", "r", "r", "rI", "rZM" } };
-    static const TCGTargetOpDef sub2
-        = { .args_ct_str = { "r", "r", "rI", "rZM", "r", "r" } };
-    static const TCGTargetOpDef v_r = { .args_ct_str = { "v", "r" } };
-    static const TCGTargetOpDef v_vr = { .args_ct_str = { "v", "vr" } };
-    static const TCGTargetOpDef v_v = { .args_ct_str = { "v", "v" } };
-    static const TCGTargetOpDef v_v_v = { .args_ct_str = { "v", "v", "v" } };
-    static const TCGTargetOpDef v_v_v_v
-        = { .args_ct_str = { "v", "v", "v", "v" } };
-
     switch (op) {
     case INDEX_op_goto_ptr:
-        return &r;
+        return C_O0_I1(r);
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8s_i32:
     case INDEX_op_ld16u_i32:
     case INDEX_op_ld16s_i32:
     case INDEX_op_ld_i32:
-    case INDEX_op_st8_i32:
-    case INDEX_op_st16_i32:
-    case INDEX_op_st_i32:
     case INDEX_op_ctpop_i32:
     case INDEX_op_neg_i32:
     case INDEX_op_not_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ld32u_i64:
     case INDEX_op_ld32s_i64:
     case INDEX_op_ld_i64:
-    case INDEX_op_st8_i64:
-    case INDEX_op_st16_i64:
-    case INDEX_op_st32_i64:
-    case INDEX_op_st_i64:
     case INDEX_op_ctpop_i64:
     case INDEX_op_neg_i64:
     case INDEX_op_not_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_bswap32_i64:
     case INDEX_op_bswap64_i64:
     case INDEX_op_extract_i64:
-        return &r_r;
+        return C_O1_I1(r, r);
+
+    case INDEX_op_st8_i32:
+    case INDEX_op_st16_i32:
+    case INDEX_op_st_i32:
+    case INDEX_op_st8_i64:
+    case INDEX_op_st16_i64:
+    case INDEX_op_st32_i64:
+    case INDEX_op_st_i64:
+        return C_O0_I2(r, r);
 
     case INDEX_op_add_i32:
     case INDEX_op_and_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_rotl_i64:
     case INDEX_op_rotr_i64:
     case INDEX_op_setcond_i64:
-        return &r_r_ri;
+        return C_O1_I2(r, r, ri);
+
     case INDEX_op_mul_i32:
     case INDEX_op_mul_i64:
-        return &r_r_rI;
+        return C_O1_I2(r, r, rI);
+
     case INDEX_op_div_i32:
     case INDEX_op_divu_i32:
     case INDEX_op_nand_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_divu_i64:
     case INDEX_op_mulsh_i64:
     case INDEX_op_muluh_i64:
-        return &r_r_r;
+        return C_O1_I2(r, r, r);
+
     case INDEX_op_sub_i32:
-        return &r_rI_ri;
+        return C_O1_I2(r, rI, ri);
     case INDEX_op_add_i64:
-        return &r_r_rT;
+        return C_O1_I2(r, r, rT);
     case INDEX_op_or_i64:
     case INDEX_op_xor_i64:
-        return &r_r_rU;
+        return C_O1_I2(r, r, rU);
     case INDEX_op_sub_i64:
-        return &r_rI_rT;
+        return C_O1_I2(r, rI, rT);
     case INDEX_op_clz_i32:
     case INDEX_op_ctz_i32:
     case INDEX_op_clz_i64:
     case INDEX_op_ctz_i64:
-        return &r_r_rZW;
+        return C_O1_I2(r, r, rZW);
 
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
-        return &r_ri;
+        return C_O0_I2(r, ri);
 
     case INDEX_op_movcond_i32:
     case INDEX_op_movcond_i64:
-        return &movc;
+        return C_O1_I4(r, r, ri, rZ, rZ);
     case INDEX_op_deposit_i32:
     case INDEX_op_deposit_i64:
-        return &dep;
+        return C_O1_I2(r, 0, rZ);
     case INDEX_op_brcond2_i32:
-        return &br2;
+        return C_O0_I4(r, r, ri, ri);
     case INDEX_op_setcond2_i32:
-        return &setc2;
+        return C_O1_I4(r, r, r, ri, ri);
     case INDEX_op_add2_i64:
     case INDEX_op_add2_i32:
-        return &add2;
+        return C_O2_I4(r, r, r, r, rI, rZM);
     case INDEX_op_sub2_i64:
     case INDEX_op_sub2_i32:
-        return &sub2;
+        return C_O2_I4(r, r, rI, rZM, r, r);
 
     case INDEX_op_qemu_ld_i32:
         return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
-                ? &r_L : &r_L_L);
+                ? C_O1_I1(r, L)
+                : C_O1_I2(r, L, L));
+
     case INDEX_op_qemu_st_i32:
         return (TCG_TARGET_REG_BITS == 64 || TARGET_LONG_BITS == 32
-                ? &S_S : &S_S_S);
+                ? C_O0_I2(S, S)
+                : C_O0_I3(S, S, S));
+
     case INDEX_op_qemu_ld_i64:
-        return (TCG_TARGET_REG_BITS == 64 ? &r_L
-                : TARGET_LONG_BITS == 32 ? &L_L_L : &L_L_L_L);
+        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
+                : TARGET_LONG_BITS == 32 ? C_O2_I1(L, L, L)
+                : C_O2_I2(L, L, L, L));
+
     case INDEX_op_qemu_st_i64:
-        return (TCG_TARGET_REG_BITS == 64 ? &S_S
-                : TARGET_LONG_BITS == 32 ? &S_S_S : &S_S_S_S);
+        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(S, S)
+                : TARGET_LONG_BITS == 32 ? C_O0_I3(S, S, S)
+                : C_O0_I4(S, S, S, S));
 
     case INDEX_op_add_vec:
     case INDEX_op_sub_vec:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ppc_mulou_vec:
     case INDEX_op_ppc_pkum_vec:
     case INDEX_op_dup2_vec:
-        return &v_v_v;
+        return C_O1_I2(v, v, v);
+
     case INDEX_op_not_vec:
     case INDEX_op_neg_vec:
-        return &v_v;
+        return C_O1_I1(v, v);
+
     case INDEX_op_dup_vec:
-        return have_isa_3_00 ? &v_vr : &v_v;
+        return have_isa_3_00 ? C_O1_I1(v, vr) : C_O1_I1(v, v);
+
     case INDEX_op_ld_vec:
-    case INDEX_op_st_vec:
     case INDEX_op_dupm_vec:
-        return &v_r;
+        return C_O1_I1(v, r);
+
+    case INDEX_op_st_vec:
+        return C_O0_I2(v, r);
+
     case INDEX_op_bitsel_vec:
     case INDEX_op_ppc_msum_vec:
-        return &v_v_v_v;
+        return C_O1_I3(v, v, v, v);
 
     default:
         return NULL;
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/riscv/tcg-target-constr.h | 25 +++++++++++
 tcg/riscv/tcg-target.c.inc    | 82 ++++++++++-------------------------
 2 files changed, 49 insertions(+), 58 deletions(-)
 create mode 100644 tcg/riscv/tcg-target-constr.h

diff --git a/tcg/riscv/tcg-target-constr.h b/tcg/riscv/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/riscv/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RISC-V target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I1(r)
+C_O0_I2(LZ, L)
+C_O0_I2(rZ, r)
+C_O0_I2(rZ, rZ)
+C_O0_I3(LZ, L, L)
+C_O0_I3(LZ, LZ, L)
+C_O0_I4(LZ, LZ, L, L)
+C_O0_I4(rZ, rZ, rZ, rZ)
+C_O1_I1(r, L)
+C_O1_I1(r, r)
+C_O1_I2(r, L, L)
+C_O1_I2(r, r, ri)
+C_O1_I2(r, r, rI)
+C_O1_I2(r, rZ, rN)
+C_O1_I2(r, rZ, rZ)
+C_O1_I4(r, rZ, rZ, rZ, rZ)
+C_O2_I1(r, r, L)
+C_O2_I2(r, r, L, L)
+C_O2_I4(r, r, rZ, rZ, rM, rM)
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    static const TCGTargetOpDef r
-        = { .args_ct_str = { "r" } };
-    static const TCGTargetOpDef r_r
-        = { .args_ct_str = { "r", "r" } };
-    static const TCGTargetOpDef rZ_r
-        = { .args_ct_str = { "rZ", "r" } };
-    static const TCGTargetOpDef rZ_rZ
-        = { .args_ct_str = { "rZ", "rZ" } };
-    static const TCGTargetOpDef rZ_rZ_rZ_rZ
-        = { .args_ct_str = { "rZ", "rZ", "rZ", "rZ" } };
-    static const TCGTargetOpDef r_r_ri
-        = { .args_ct_str = { "r", "r", "ri" } };
-    static const TCGTargetOpDef r_r_rI
-        = { .args_ct_str = { "r", "r", "rI" } };
-    static const TCGTargetOpDef r_rZ_rN
-        = { .args_ct_str = { "r", "rZ", "rN" } };
-    static const TCGTargetOpDef r_rZ_rZ
-        = { .args_ct_str = { "r", "rZ", "rZ" } };
-    static const TCGTargetOpDef r_rZ_rZ_rZ_rZ
-        = { .args_ct_str = { "r", "rZ", "rZ", "rZ", "rZ" } };
-    static const TCGTargetOpDef r_L
-        = { .args_ct_str = { "r", "L" } };
-    static const TCGTargetOpDef r_r_L
-        = { .args_ct_str = { "r", "r", "L" } };
-    static const TCGTargetOpDef r_L_L
-        = { .args_ct_str = { "r", "L", "L" } };
-    static const TCGTargetOpDef r_r_L_L
-        = { .args_ct_str = { "r", "r", "L", "L" } };
-    static const TCGTargetOpDef LZ_L
-        = { .args_ct_str = { "LZ", "L" } };
-    static const TCGTargetOpDef LZ_L_L
-        = { .args_ct_str = { "LZ", "L", "L" } };
-    static const TCGTargetOpDef LZ_LZ_L
-        = { .args_ct_str = { "LZ", "LZ", "L" } };
-    static const TCGTargetOpDef LZ_LZ_L_L
-        = { .args_ct_str = { "LZ", "LZ", "L", "L" } };
-    static const TCGTargetOpDef r_r_rZ_rZ_rM_rM
-        = { .args_ct_str = { "r", "r", "rZ", "rZ", "rM", "rM" } };
-
     switch (op) {
     case INDEX_op_goto_ptr:
-        return &r;
+        return C_O0_I1(r);
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8s_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_extrl_i64_i32:
     case INDEX_op_extrh_i64_i32:
     case INDEX_op_ext_i32_i64:
-        return &r_r;
+        return C_O1_I1(r, r);
 
     case INDEX_op_st8_i32:
     case INDEX_op_st16_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_st16_i64:
     case INDEX_op_st32_i64:
     case INDEX_op_st_i64:
-        return &rZ_r;
+        return C_O0_I2(rZ, r);
 
     case INDEX_op_add_i32:
     case INDEX_op_and_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_and_i64:
     case INDEX_op_or_i64:
     case INDEX_op_xor_i64:
-        return &r_r_rI;
+        return C_O1_I2(r, r, rI);
 
     case INDEX_op_sub_i32:
     case INDEX_op_sub_i64:
-        return &r_rZ_rN;
+        return C_O1_I2(r, rZ, rN);
 
     case INDEX_op_mul_i32:
     case INDEX_op_mulsh_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_rem_i64:
     case INDEX_op_remu_i64:
     case INDEX_op_setcond_i64:
-        return &r_rZ_rZ;
+        return C_O1_I2(r, rZ, rZ);
 
     case INDEX_op_shl_i32:
     case INDEX_op_shr_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shl_i64:
     case INDEX_op_shr_i64:
     case INDEX_op_sar_i64:
-        return &r_r_ri;
+        return C_O1_I2(r, r, ri);
 
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
-        return &rZ_rZ;
+        return C_O0_I2(rZ, rZ);
 
     case INDEX_op_add2_i32:
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i32:
     case INDEX_op_sub2_i64:
-        return &r_r_rZ_rZ_rM_rM;
+        return C_O2_I4(r, r, rZ, rZ, rM, rM);
 
     case INDEX_op_brcond2_i32:
-        return &rZ_rZ_rZ_rZ;
+        return C_O0_I4(rZ, rZ, rZ, rZ);
 
     case INDEX_op_setcond2_i32:
-        return &r_rZ_rZ_rZ_rZ;
+        return C_O1_I4(r, rZ, rZ, rZ, rZ);
 
     case INDEX_op_qemu_ld_i32:
-        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
+        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+                ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
     case INDEX_op_qemu_st_i32:
-        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &LZ_L : &LZ_L_L;
+        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+                ? C_O0_I2(LZ, L) : C_O0_I3(LZ, L, L));
     case INDEX_op_qemu_ld_i64:
-        return TCG_TARGET_REG_BITS == 64 ? &r_L
-               : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
-               : &r_r_L_L;
+        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
+               : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
+               : C_O2_I2(r, r, L, L));
     case INDEX_op_qemu_st_i64:
-        return TCG_TARGET_REG_BITS == 64 ? &LZ_L
-               : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &LZ_LZ_L
-               : &LZ_LZ_L_L;
+        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(LZ, L)
+               : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(LZ, LZ, L)
+               : C_O0_I4(LZ, LZ, L, L));
 
     default:
         return NULL;
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390/tcg-target-constr.h |  24 +++++++
 tcg/s390/tcg-target.c.inc    | 119 +++++++++++++++--------------------
 2 files changed, 76 insertions(+), 67 deletions(-)
 create mode 100644 tcg/s390/tcg-target-constr.h

diff --git a/tcg/s390/tcg-target-constr.h b/tcg/s390/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/s390/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * S390 target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I1(r)
+C_O0_I2(L, L)
+C_O0_I2(r, r)
+C_O0_I2(r, ri)
+C_O1_I1(r, L)
+C_O1_I1(r, r)
+C_O1_I2(r, 0, ri)
+C_O1_I2(r, 0, rI)
+C_O1_I2(r, 0, rJ)
+C_O1_I2(r, r, ri)
+C_O1_I2(r, rZ, r)
+C_O1_I4(r, r, ri, r, 0)
+C_O1_I4(r, r, ri, rI, 0)
+C_O2_I2(b, a, 0, r)
+C_O2_I3(b, a, 0, 1, r)
+C_O2_I4(r, r, 0, 1, rA, r)
+C_O2_I4(r, r, 0, 1, ri, r)
+C_O2_I4(r, r, 0, 1, r, r)
diff --git a/tcg/s390/tcg-target.c.inc b/tcg/s390/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/s390/tcg-target.c.inc
+++ b/tcg/s390/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
-    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
-    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
-    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
-    static const TCGTargetOpDef r_ri = { .args_ct_str = { "r", "ri" } };
-    static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
-    static const TCGTargetOpDef r_0_ri = { .args_ct_str = { "r", "0", "ri" } };
-    static const TCGTargetOpDef r_0_rI = { .args_ct_str = { "r", "0", "rI" } };
-    static const TCGTargetOpDef r_0_rJ = { .args_ct_str = { "r", "0", "rJ" } };
-    static const TCGTargetOpDef a2_r
-        = { .args_ct_str = { "r", "r", "0", "1", "r", "r" } };
-    static const TCGTargetOpDef a2_ri
-        = { .args_ct_str = { "r", "r", "0", "1", "ri", "r" } };
-    static const TCGTargetOpDef a2_rA
-        = { .args_ct_str = { "r", "r", "0", "1", "rA", "r" } };
-
     switch (op) {
     case INDEX_op_goto_ptr:
-        return &r;
+        return C_O0_I1(r);
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8u_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ld32u_i64:
     case INDEX_op_ld32s_i64:
     case INDEX_op_ld_i64:
+        return C_O1_I1(r, r);
+
     case INDEX_op_st8_i32:
     case INDEX_op_st8_i64:
     case INDEX_op_st16_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_st_i32:
     case INDEX_op_st32_i64:
     case INDEX_op_st_i64:
-        return &r_r;
+        return C_O0_I2(r, r);
 
     case INDEX_op_add_i32:
     case INDEX_op_add_i64:
-        return &r_r_ri;
+    case INDEX_op_shl_i64:
+    case INDEX_op_shr_i64:
+    case INDEX_op_sar_i64:
+    case INDEX_op_rotl_i32:
+    case INDEX_op_rotl_i64:
+    case INDEX_op_rotr_i32:
+    case INDEX_op_rotr_i64:
+    case INDEX_op_clz_i64:
+    case INDEX_op_setcond_i32:
+    case INDEX_op_setcond_i64:
+        return C_O1_I2(r, r, ri);
+
     case INDEX_op_sub_i32:
     case INDEX_op_sub_i64:
     case INDEX_op_and_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_or_i64:
     case INDEX_op_xor_i32:
     case INDEX_op_xor_i64:
-        return (s390_facilities & FACILITY_DISTINCT_OPS ? &r_r_ri : &r_0_ri);
+        return (s390_facilities & FACILITY_DISTINCT_OPS
+                ? C_O1_I2(r, r, ri)
+                : C_O1_I2(r, 0, ri));
 
     case INDEX_op_mul_i32:
         /* If we have the general-instruction-extensions, then we have
            MULTIPLY SINGLE IMMEDIATE with a signed 32-bit, otherwise we
            have only MULTIPLY HALFWORD IMMEDIATE, with a signed 16-bit.  */
-        return (s390_facilities & FACILITY_GEN_INST_EXT ? &r_0_ri : &r_0_rI);
+        return (s390_facilities & FACILITY_GEN_INST_EXT
+                ? C_O1_I2(r, 0, ri)
+                : C_O1_I2(r, 0, rI));
+
     case INDEX_op_mul_i64:
-        return (s390_facilities & FACILITY_GEN_INST_EXT ? &r_0_rJ : &r_0_rI);
+        return (s390_facilities & FACILITY_GEN_INST_EXT
+                ? C_O1_I2(r, 0, rJ)
+                : C_O1_I2(r, 0, rI));
 
     case INDEX_op_shl_i32:
     case INDEX_op_shr_i32:
     case INDEX_op_sar_i32:
-        return (s390_facilities & FACILITY_DISTINCT_OPS ? &r_r_ri : &r_0_ri);
-
-    case INDEX_op_shl_i64:
-    case INDEX_op_shr_i64:
-    case INDEX_op_sar_i64:
-        return &r_r_ri;
-
-    case INDEX_op_rotl_i32:
-    case INDEX_op_rotl_i64:
-    case INDEX_op_rotr_i32:
-    case INDEX_op_rotr_i64:
-        return &r_r_ri;
+        return (s390_facilities & FACILITY_DISTINCT_OPS
+                ? C_O1_I2(r, r, ri)
+                : C_O1_I2(r, 0, ri));
 
     case INDEX_op_brcond_i32:
     case INDEX_op_brcond_i64:
-        return &r_ri;
+        return C_O0_I2(r, ri);
 
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_extu_i32_i64:
     case INDEX_op_extract_i32:
     case INDEX_op_extract_i64:
-        return &r_r;
-
-    case INDEX_op_clz_i64:
-    case INDEX_op_setcond_i32:
-    case INDEX_op_setcond_i64:
-        return &r_r_ri;
+        return C_O1_I1(r, r);
 
     case INDEX_op_qemu_ld_i32:
     case INDEX_op_qemu_ld_i64:
-        return &r_L;
+        return C_O1_I1(r, L);
     case INDEX_op_qemu_st_i64:
     case INDEX_op_qemu_st_i32:
-        return &L_L;
+        return C_O0_I2(L, L);
 
     case INDEX_op_deposit_i32:
     case INDEX_op_deposit_i64:
-        {
-            static const TCGTargetOpDef dep
-                = { .args_ct_str = { "r", "rZ", "r" } };
-            return &dep;
-        }
+        return C_O1_I2(r, rZ, r);
+
     case INDEX_op_movcond_i32:
     case INDEX_op_movcond_i64:
-        {
-            static const TCGTargetOpDef movc
-                = { .args_ct_str = { "r", "r", "ri", "r", "0" } };
-            static const TCGTargetOpDef movc_l
-                = { .args_ct_str = { "r", "r", "ri", "rI", "0" } };
-            return (s390_facilities & FACILITY_LOAD_ON_COND2 ? &movc_l : &movc);
-        }
+        return (s390_facilities & FACILITY_LOAD_ON_COND2
+                ? C_O1_I4(r, r, ri, rI, 0)
+                : C_O1_I4(r, r, ri, r, 0));
+
     case INDEX_op_div2_i32:
     case INDEX_op_div2_i64:
     case INDEX_op_divu2_i32:
     case INDEX_op_divu2_i64:
-        {
-            static const TCGTargetOpDef div2
-                = { .args_ct_str = { "b", "a", "0", "1", "r" } };
-            return &div2;
-        }
+        return C_O2_I3(b, a, 0, 1, r);
+
     case INDEX_op_mulu2_i64:
-        {
-            static const TCGTargetOpDef mul2
-                = { .args_ct_str = { "b", "a", "0", "r" } };
-            return &mul2;
-        }
+        return C_O2_I2(b, a, 0, r);
 
     case INDEX_op_add2_i32:
     case INDEX_op_sub2_i32:
-        return (s390_facilities & FACILITY_EXT_IMM ? &a2_ri : &a2_r);
+        return (s390_facilities & FACILITY_EXT_IMM
+                ? C_O2_I4(r, r, 0, 1, ri, r)
+                : C_O2_I4(r, r, 0, 1, r, r));
+
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i64:
-        return (s390_facilities & FACILITY_EXT_IMM ? &a2_rA : &a2_r);
+        return (s390_facilities & FACILITY_EXT_IMM
+                ? C_O2_I4(r, r, 0, 1, rA, r)
+                : C_O2_I4(r, r, 0, 1, r, r));
 
     default:
         break;
-- 
2.25.1

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/sparc/tcg-target-constr.h | 27 +++++++++++++
 tcg/sparc/tcg-target.c.inc    | 74 ++++++++++++-----------------------
 2 files changed, 51 insertions(+), 50 deletions(-)
 create mode 100644 tcg/sparc/tcg-target-constr.h

diff --git a/tcg/sparc/tcg-target-constr.h b/tcg/sparc/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/sparc/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Sparc target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I1(r)
+C_O0_I2(rZ, r)
+C_O0_I2(RZ, r)
+C_O0_I2(rZ, rJ)
+C_O0_I2(RZ, RJ)
+C_O0_I2(sZ, A)
+C_O0_I2(SZ, A)
+C_O1_I1(r, A)
+C_O1_I1(R, A)
+C_O1_I1(r, r)
+C_O1_I1(r, R)
+C_O1_I1(R, r)
+C_O1_I1(R, R)
+C_O1_I2(R, R, R)
+C_O1_I2(r, rZ, rJ)
+C_O1_I2(R, RZ, RJ)
+C_O1_I4(r, rZ, rJ, rI, 0)
+C_O1_I4(R, RZ, RJ, RI, 0)
+C_O2_I2(r, r, rZ, rJ)
+C_O2_I4(R, R, RZ, RZ, RJ, RI)
+C_O2_I4(r, r, rZ, rZ, rJ, rJ)
diff --git a/tcg/sparc/tcg-target.c.inc b/tcg/sparc/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/sparc/tcg-target.c.inc
+++ b/tcg/sparc/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
     }
 }
 
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
-    static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
-    static const TCGTargetOpDef R_r = { .args_ct_str = { "R", "r" } };
-    static const TCGTargetOpDef r_R = { .args_ct_str = { "r", "R" } };
-    static const TCGTargetOpDef R_R = { .args_ct_str = { "R", "R" } };
-    static const TCGTargetOpDef r_A = { .args_ct_str = { "r", "A" } };
-    static const TCGTargetOpDef R_A = { .args_ct_str = { "R", "A" } };
-    static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
-    static const TCGTargetOpDef RZ_r = { .args_ct_str = { "RZ", "r" } };
-    static const TCGTargetOpDef sZ_A = { .args_ct_str = { "sZ", "A" } };
-    static const TCGTargetOpDef SZ_A = { .args_ct_str = { "SZ", "A" } };
-    static const TCGTargetOpDef rZ_rJ = { .args_ct_str = { "rZ", "rJ" } };
-    static const TCGTargetOpDef RZ_RJ = { .args_ct_str = { "RZ", "RJ" } };
-    static const TCGTargetOpDef R_R_R = { .args_ct_str = { "R", "R", "R" } };
-    static const TCGTargetOpDef r_rZ_rJ
-        = { .args_ct_str = { "r", "rZ", "rJ" } };
-    static const TCGTargetOpDef R_RZ_RJ
-        = { .args_ct_str = { "R", "RZ", "RJ" } };
-    static const TCGTargetOpDef r_r_rZ_rJ
-        = { .args_ct_str = { "r", "r", "rZ", "rJ" } };
-    static const TCGTargetOpDef movc_32
-        = { .args_ct_str = { "r", "rZ", "rJ", "rI", "0" } };
-    static const TCGTargetOpDef movc_64
-        = { .args_ct_str = { "R", "RZ", "RJ", "RI", "0" } };
-    static const TCGTargetOpDef add2_32
-        = { .args_ct_str = { "r", "r", "rZ", "rZ", "rJ", "rJ" } };
-    static const TCGTargetOpDef add2_64
-        = { .args_ct_str = { "R", "R", "RZ", "RZ", "RJ", "RI" } };
-
     switch (op) {
     case INDEX_op_goto_ptr:
-        return &r;
+        return C_O0_I1(r);
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8s_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ld_i32:
     case INDEX_op_neg_i32:
     case INDEX_op_not_i32:
-        return &r_r;
+        return C_O1_I1(r, r);
 
     case INDEX_op_st8_i32:
     case INDEX_op_st16_i32:
     case INDEX_op_st_i32:
-        return &rZ_r;
+        return C_O0_I2(rZ, r);
 
     case INDEX_op_add_i32:
     case INDEX_op_mul_i32:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shr_i32:
     case INDEX_op_sar_i32:
     case INDEX_op_setcond_i32:
-        return &r_rZ_rJ;
+        return C_O1_I2(r, rZ, rJ);
 
     case INDEX_op_brcond_i32:
-        return &rZ_rJ;
+        return C_O0_I2(rZ, rJ);
     case INDEX_op_movcond_i32:
-        return &movc_32;
+        return C_O1_I4(r, rZ, rJ, rI, 0);
     case INDEX_op_add2_i32:
     case INDEX_op_sub2_i32:
-        return &add2_32;
+        return C_O2_I4(r, r, rZ, rZ, rJ, rJ);
     case INDEX_op_mulu2_i32:
     case INDEX_op_muls2_i32:
-        return &r_r_rZ_rJ;
+        return C_O2_I2(r, r, rZ, rJ);
 
     case INDEX_op_ld8u_i64:
     case INDEX_op_ld8s_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ld_i64:
     case INDEX_op_ext_i32_i64:
     case INDEX_op_extu_i32_i64:
-        return &R_r;
+        return C_O1_I1(R, r);
 
     case INDEX_op_st8_i64:
     case INDEX_op_st16_i64:
     case INDEX_op_st32_i64:
     case INDEX_op_st_i64:
-        return &RZ_r;
+        return C_O0_I2(RZ, r);
 
     case INDEX_op_add_i64:
     case INDEX_op_mul_i64:
@@ -XXX,XX +XXX,XX @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shr_i64:
     case INDEX_op_sar_i64:
     case INDEX_op_setcond_i64:
-        return &R_RZ_RJ;
+        return C_O1_I2(R, RZ, RJ);
 
     case INDEX_op_neg_i64:
     case INDEX_op_not_i64:
     case INDEX_op_ext32s_i64:
     case INDEX_op_ext32u_i64:
-        return &R_R;
+        return C_O1_I1(R, R);
 
     case INDEX_op_extrl_i64_i32:
     case INDEX_op_extrh_i64_i32:
-        return &r_R;
+        return C_O1_I1(r, R);
 
     case INDEX_op_brcond_i64:
-        return &RZ_RJ;
+        return C_O0_I2(RZ, RJ);
     case INDEX_op_movcond_i64:
-        return &movc_64;
+        return C_O1_I4(R, RZ, RJ, RI, 0);
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i64:
-        return &add2_64;
+        return C_O2_I4(R, R, RZ, RZ, RJ, RI);
     case INDEX_op_muluh_i64:
-        return &R_R_R;
+        return C_O1_I2(R, R, R);
 
     case INDEX_op_qemu_ld_i32:
-        return &r_A;
+        return C_O1_I1(r, A);
     case INDEX_op_qemu_ld_i64:
-        return &R_A;
+        return C_O1_I1(R, A);
     case INDEX_op_qemu_st_i32:
-        return &sZ_A;
+        return C_O0_I2(sZ, A);
     case INDEX_op_qemu_st_i64:
-        return &SZ_A;
+        return C_O0_I2(SZ, A);
 
     default:
         return NULL;
-- 
2.25.1

This does require finishing the conversion to tcg_target_op_def.
Remove quite a lot of ifdefs, since we can reference opcodes
even if they are not implemented.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci/tcg-target-constr.h |  28 +++
 tcg/tci/tcg-target.c.inc    | 360 ++++++++++++++----------------------
 2 files changed, 163 insertions(+), 225 deletions(-)
 create mode 100644 tcg/tci/tcg-target-constr.h

diff --git a/tcg/tci/tcg-target-constr.h b/tcg/tci/tcg-target-constr.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tcg/tci/tcg-target-constr.h
@@ -XXX,XX +XXX,XX @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * TCI target-specific operand constraints.
+ * Copyright (c) 2020 Linaro
+ */
+
+C_O0_I2(r, r)
+C_O0_I2(r, ri)
+C_O0_I2(r, S)
+C_O0_I3(r, r, S)
+C_O0_I3(r, S, S)
+C_O0_I4(r, r, S, S)
+C_O1_I1(r, L)
+C_O1_I1(r, r)
+C_O1_I2(r, 0, r)
+C_O1_I2(r, L, L)
+C_O1_I2(r, ri, ri)
+C_O1_I2(r, r, r)
+C_O1_I2(r, r, ri)
+C_O2_I1(r, r, L)
+C_O2_I2(r, r, L, L)
+
+#if TCG_TARGET_REG_BITS == 32
+C_O0_I4(r, r, ri, ri)
+C_O1_I4(r, r, r, ri, ri)
+C_O2_I2(r, r, r, r)
+C_O2_I4(r, r, r, r, r, r)
+#endif
diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci/tcg-target.c.inc
+++ b/tcg/tci/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@
 /* Bitfield n...m (in 32 bit value). */
 #define BITS(n, m) (((0xffffffffU << (31 - n)) >> (31 - n + m)) << m)
 
-/* Macros used in tcg_target_op_defs. */
-#define R       "r"
-#define RI      "ri"
-#if TCG_TARGET_REG_BITS == 32
-# define R64    "r", "r"
-#else
-# define R64    "r"
-#endif
-#if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
-# define L      "L", "L"
-# define S      "S", "S"
-#else
-# define L      "L"
-# define S      "S"
-#endif
-
-/* TODO: documentation. */
-static const TCGTargetOpDef tcg_target_op_defs[] = {
-    { INDEX_op_exit_tb, { NULL } },
-    { INDEX_op_goto_tb, { NULL } },
-    { INDEX_op_br, { NULL } },
-
-    { INDEX_op_ld8u_i32, { R, R } },
-    { INDEX_op_ld8s_i32, { R, R } },
-    { INDEX_op_ld16u_i32, { R, R } },
-    { INDEX_op_ld16s_i32, { R, R } },
-    { INDEX_op_ld_i32, { R, R } },
-    { INDEX_op_st8_i32, { R, R } },
-    { INDEX_op_st16_i32, { R, R } },
-    { INDEX_op_st_i32, { R, R } },
-
-    { INDEX_op_add_i32, { R, RI, RI } },
-    { INDEX_op_sub_i32, { R, RI, RI } },
-    { INDEX_op_mul_i32, { R, RI, RI } },
-#if TCG_TARGET_HAS_div_i32
-    { INDEX_op_div_i32, { R, R, R } },
-    { INDEX_op_divu_i32, { R, R, R } },
-    { INDEX_op_rem_i32, { R, R, R } },
-    { INDEX_op_remu_i32, { R, R, R } },
-#elif TCG_TARGET_HAS_div2_i32
-    { INDEX_op_div2_i32, { R, R, "0", "1", R } },
-    { INDEX_op_divu2_i32, { R, R, "0", "1", R } },
-#endif
-    /* TODO: Does R, RI, RI result in faster code than R, R, RI?
-       If both operands are constants, we can optimize. */
-    { INDEX_op_and_i32, { R, RI, RI } },
-#if TCG_TARGET_HAS_andc_i32
-    { INDEX_op_andc_i32, { R, RI, RI } },
-#endif
-#if TCG_TARGET_HAS_eqv_i32
-    { INDEX_op_eqv_i32, { R, RI, RI } },
-#endif
-#if TCG_TARGET_HAS_nand_i32
-    { INDEX_op_nand_i32, { R, RI, RI } },
-#endif
-#if TCG_TARGET_HAS_nor_i32
-    { INDEX_op_nor_i32, { R, RI, RI } },
-#endif
-    { INDEX_op_or_i32, { R, RI, RI } },
-#if TCG_TARGET_HAS_orc_i32
-    { INDEX_op_orc_i32, { R, RI, RI } },
-#endif
-    { INDEX_op_xor_i32, { R, RI, RI } },
-    { INDEX_op_shl_i32, { R, RI, RI } },
-    { INDEX_op_shr_i32, { R, RI, RI } },
-    { INDEX_op_sar_i32, { R, RI, RI } },
-#if TCG_TARGET_HAS_rot_i32
-    { INDEX_op_rotl_i32, { R, RI, RI } },
-    { INDEX_op_rotr_i32, { R, RI, RI } },
-#endif
-#if TCG_TARGET_HAS_deposit_i32
-    { INDEX_op_deposit_i32, { R, "0", R } },
-#endif
-
-    { INDEX_op_brcond_i32, { R, RI } },
-
-    { INDEX_op_setcond_i32, { R, R, RI } },
-#if TCG_TARGET_REG_BITS == 64
-    { INDEX_op_setcond_i64, { R, R, RI } },
-#endif /* TCG_TARGET_REG_BITS == 64 */
-
-#if TCG_TARGET_REG_BITS == 32
-    /* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
-    { INDEX_op_add2_i32, { R, R, R, R, R, R } },
-    { INDEX_op_sub2_i32, { R, R, R, R, R, R } },
-    { INDEX_op_brcond2_i32, { R, R, RI, RI } },
-    { INDEX_op_mulu2_i32, { R, R, R, R } },
-    { INDEX_op_setcond2_i32, { R, R, R, RI, RI } },
-#endif
-
-#if TCG_TARGET_HAS_not_i32
-    { INDEX_op_not_i32, { R, R } },
-#endif
-#if TCG_TARGET_HAS_neg_i32
-    { INDEX_op_neg_i32, { R, R } },
-#endif
-
-#if TCG_TARGET_REG_BITS == 64
-    { INDEX_op_ld8u_i64, { R, R } },
-    { INDEX_op_ld8s_i64, { R, R } },
-    { INDEX_op_ld16u_i64, { R, R } },
-    { INDEX_op_ld16s_i64, { R, R } },
-    { INDEX_op_ld32u_i64, { R, R } },
-    { INDEX_op_ld32s_i64, { R, R } },
-    { INDEX_op_ld_i64, { R, R } },
-
-    { INDEX_op_st8_i64, { R, R } },
-    { INDEX_op_st16_i64, { R, R } },
-    { INDEX_op_st32_i64, { R, R } },
-    { INDEX_op_st_i64, { R, R } },
-
-    { INDEX_op_add_i64, { R, RI, RI } },
-    { INDEX_op_sub_i64, { R, RI, RI } },
-    { INDEX_op_mul_i64, { R, RI, RI } },
-#if TCG_TARGET_HAS_div_i64
-    { INDEX_op_div_i64, { R, R, R } },
-    { INDEX_op_divu_i64, { R, R, R } },
-    { INDEX_op_rem_i64, { R, R, R } },
-    { INDEX_op_remu_i64, { R, R, R } },
-#elif TCG_TARGET_HAS_div2_i64
-    { INDEX_op_div2_i64, { R, R, "0", "1", R } },
-    { INDEX_op_divu2_i64, { R, R, "0", "1", R } },
-#endif
-    { INDEX_op_and_i64, { R, RI, RI } },
-#if TCG_TARGET_HAS_andc_i64
-    { INDEX_op_andc_i64, { R, RI, RI } },
-#endif
-#if TCG_TARGET_HAS_eqv_i64
-    { INDEX_op_eqv_i64, { R, RI, RI } },
-#endif
-#if TCG_TARGET_HAS_nand_i64
-    { INDEX_op_nand_i64, { R, RI, RI } },
-#endif
-#if TCG_TARGET_HAS_nor_i64
-    { INDEX_op_nor_i64, { R, RI, RI } },
-#endif
-    { INDEX_op_or_i64, { R, RI, RI } },
-#if TCG_TARGET_HAS_orc_i64
-    { INDEX_op_orc_i64, { R, RI, RI } },
-#endif
-    { INDEX_op_xor_i64, { R, RI, RI } },
-    { INDEX_op_shl_i64, { R, RI, RI } },
-    { INDEX_op_shr_i64, { R, RI, RI } },
-    { INDEX_op_sar_i64, { R, RI, RI } },
-#if TCG_TARGET_HAS_rot_i64
-    { INDEX_op_rotl_i64, { R, RI, RI } },
-    { INDEX_op_rotr_i64, { R, RI, RI } },
-#endif
-#if TCG_TARGET_HAS_deposit_i64
-    { INDEX_op_deposit_i64, { R, "0", R } },
-#endif
-    { INDEX_op_brcond_i64, { R, RI } },
-
-#if TCG_TARGET_HAS_ext8s_i64
-    { INDEX_op_ext8s_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_ext16s_i64
-    { INDEX_op_ext16s_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_ext32s_i64
-    { INDEX_op_ext32s_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_ext8u_i64
-    { INDEX_op_ext8u_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_ext16u_i64
-    { INDEX_op_ext16u_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_ext32u_i64
-    { INDEX_op_ext32u_i64, { R, R } },
-#endif
-    { INDEX_op_ext_i32_i64, { R, R } },
-    { INDEX_op_extu_i32_i64, { R, R } },
-#if TCG_TARGET_HAS_bswap16_i64
-    { INDEX_op_bswap16_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_bswap32_i64
-    { INDEX_op_bswap32_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_bswap64_i64
-    { INDEX_op_bswap64_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_not_i64
-    { INDEX_op_not_i64, { R, R } },
-#endif
-#if TCG_TARGET_HAS_neg_i64
-    { INDEX_op_neg_i64, { R, R } },
-#endif
-#endif /* TCG_TARGET_REG_BITS == 64 */
-
-    { INDEX_op_qemu_ld_i32, { R, L } },
-    { INDEX_op_qemu_ld_i64, { R64, L } },
-
-    { INDEX_op_qemu_st_i32, { R, S } },
-    { INDEX_op_qemu_st_i64, { R64, S } },
-
-#if TCG_TARGET_HAS_ext8s_i32
-    { INDEX_op_ext8s_i32, { R, R } },
-#endif
-#if TCG_TARGET_HAS_ext16s_i32
-    { INDEX_op_ext16s_i32, { R, R } },
-#endif
-#if TCG_TARGET_HAS_ext8u_i32
-    { INDEX_op_ext8u_i32, { R, R } },
-#endif
-#if TCG_TARGET_HAS_ext16u_i32
-    { INDEX_op_ext16u_i32, { R, R } },
-#endif
-
-#if TCG_TARGET_HAS_bswap16_i32
-    { INDEX_op_bswap16_i32, { R, R } },
-#endif
-#if TCG_TARGET_HAS_bswap32_i32
-    { INDEX_op_bswap32_i32, { R, R } },
-#endif
-
-    { INDEX_op_mb, { } },
-    { -1 },
-};
+/* Define all constraint sets. */
+#include "../tcg-constr.c.inc"
 
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
-    int i, n = ARRAY_SIZE(tcg_target_op_defs);
+    switch (op) {
+    case INDEX_op_ld8u_i32:
+    case INDEX_op_ld8s_i32:
+    case INDEX_op_ld16u_i32:
+    case INDEX_op_ld16s_i32:
+    case INDEX_op_ld_i32:
+    case INDEX_op_ld8u_i64:
+    case INDEX_op_ld8s_i64:
+    case INDEX_op_ld16u_i64:
+    case INDEX_op_ld16s_i64:
+    case INDEX_op_ld32u_i64:
+    case INDEX_op_ld32s_i64:
+    case INDEX_op_ld_i64:
+    case INDEX_op_not_i32:
+    case INDEX_op_not_i64:
+    case INDEX_op_neg_i32:
+    case INDEX_op_neg_i64:
+    case INDEX_op_ext8s_i32:
+    case INDEX_op_ext8s_i64:
+    case INDEX_op_ext16s_i32:
+    case INDEX_op_ext16s_i64:
+    case INDEX_op_ext8u_i32:
+    case INDEX_op_ext8u_i64:
+    case INDEX_op_ext16u_i32:
+    case INDEX_op_ext16u_i64:
+    case INDEX_op_ext32s_i64:
+    case INDEX_op_ext32u_i64:
+    case INDEX_op_ext_i32_i64:
+    case INDEX_op_extu_i32_i64:
+    case INDEX_op_bswap16_i32:
+    case INDEX_op_bswap16_i64:
+    case INDEX_op_bswap32_i32:
+    case INDEX_op_bswap32_i64:
+    case INDEX_op_bswap64_i64:
+        return C_O1_I1(r, r);
 
-    for (i = 0; i < n; ++i) {
-        if (tcg_target_op_defs[i].op == op) {
-            return &tcg_target_op_defs[i];
-        }
+    case INDEX_op_st8_i32:
+    case INDEX_op_st16_i32:
+    case INDEX_op_st_i32:
+    case INDEX_op_st8_i64:
+    case INDEX_op_st16_i64:
+    case INDEX_op_st32_i64:
+    case INDEX_op_st_i64:
+        return C_O0_I2(r, r);
+
+    case INDEX_op_div_i32:
+    case INDEX_op_div_i64:
+    case INDEX_op_divu_i32:
+    case INDEX_op_divu_i64:
+    case INDEX_op_rem_i32:
+    case INDEX_op_rem_i64:
+    case INDEX_op_remu_i32:
+    case INDEX_op_remu_i64:
+        return C_O1_I2(r, r, r);
+
+    case INDEX_op_add_i32:
+    case INDEX_op_add_i64:
+    case INDEX_op_sub_i32:
+    case INDEX_op_sub_i64:
+    case INDEX_op_mul_i32:
+    case INDEX_op_mul_i64:
+    case INDEX_op_and_i32:
+    case INDEX_op_and_i64:
+    case INDEX_op_andc_i32:
+    case INDEX_op_andc_i64:
+    case INDEX_op_eqv_i32:
+    case INDEX_op_eqv_i64:
+    case INDEX_op_nand_i32:
+    case INDEX_op_nand_i64:
+    case INDEX_op_nor_i32:
+    case INDEX_op_nor_i64:
+    case INDEX_op_or_i32:
+    case INDEX_op_or_i64:
+    case INDEX_op_orc_i32:
+    case INDEX_op_orc_i64:
+    case INDEX_op_xor_i32:
+    case INDEX_op_xor_i64:
+    case INDEX_op_shl_i32:
+    case INDEX_op_shl_i64:
+    case INDEX_op_shr_i32:
+    case INDEX_op_shr_i64:
+    case INDEX_op_sar_i32:
+    case INDEX_op_sar_i64:
+    case INDEX_op_rotl_i32:
+    case INDEX_op_rotl_i64:
+    case INDEX_op_rotr_i32:
+    case INDEX_op_rotr_i64:
+        /* TODO: Does R, RI, RI result in faster code than R, R, RI? */
+        return C_O1_I2(r, ri, ri);
+
+    case INDEX_op_deposit_i32:
+    case INDEX_op_deposit_i64:
+        return C_O1_I2(r, 0, r);
+
+    case INDEX_op_brcond_i32:
+    case INDEX_op_brcond_i64:
+        return C_O0_I2(r, ri);
+
+    case INDEX_op_setcond_i32:
+    case INDEX_op_setcond_i64:
+        return C_O1_I2(r, r, ri);
+
+#if TCG_TARGET_REG_BITS == 32
+    /* TODO: Support R, R, R, R, RI, RI? Will it be faster? */
+    case INDEX_op_add2_i32:
+    case INDEX_op_sub2_i32:
+        return C_O2_I4(r, r, r, r, r, r);
+    case INDEX_op_brcond2_i32:
+        return C_O0_I4(r, r, ri, ri);
+    case INDEX_op_mulu2_i32:
+        return C_O2_I2(r, r, r, r);
+    case INDEX_op_setcond2_i32:
+        return C_O1_I4(r, r, r, ri, ri);
+#endif
+
+    case INDEX_op_qemu_ld_i32:
+        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+                ? C_O1_I1(r, L)
+                : C_O1_I2(r, L, L));
+    case INDEX_op_qemu_ld_i64:
+        return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
+                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
+                : C_O2_I2(r, r, L, L));
+    case INDEX_op_qemu_st_i32:
+        return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
+                ? C_O0_I2(r, S)
+                : C_O0_I3(r, S, S));
+    case INDEX_op_qemu_st_i64:
+        return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(r, S)
+                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(r, r, S)
+                : C_O0_I4(r, r, S, S));
+
+    default:
+        return NULL;
     }
-    return NULL;
 }
 
 static const int tcg_target_reg_alloc_order[] = {
-- 
2.25.1

The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:

Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027

for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:

tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)

----------------------------------------------------------------
Improvements to qemu/int128
Fixes for 128/64 division.
Cleanup tcg/optimize.c
Optimize redundant sign extensions

----------------------------------------------------------------
Frédéric Pétrot (1):
      qemu/int128: Add int128_{not,xor}

Luis Pires (4):
      host-utils: move checks out of divu128/divs128
      host-utils: move udiv_qrnnd() to host-utils
      host-utils: add 128-bit quotient support to divu128/divs128
      host-utils: add unit tests for divu128/divs128

Richard Henderson (51):
      tcg/optimize: Rename "mask" to "z_mask"
      tcg/optimize: Split out OptContext
      tcg/optimize: Remove do_default label
      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
      tcg/optimize: Move prev_mb into OptContext
      tcg/optimize: Split out init_arguments
      tcg/optimize: Split out copy_propagate
      tcg/optimize: Split out fold_call
      tcg/optimize: Drop nb_oargs, nb_iargs locals
      tcg/optimize: Change fail return for do_constant_folding_cond*
      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
      tcg/optimize: Split out finish_folding
      tcg/optimize: Use a boolean to avoid a mass of continues
      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
      tcg/optimize: Split out fold_const{1,2}
      tcg/optimize: Split out fold_setcond2
      tcg/optimize: Split out fold_brcond2
      tcg/optimize: Split out fold_brcond
      tcg/optimize: Split out fold_setcond
      tcg/optimize: Split out fold_mulu2_i32
      tcg/optimize: Split out fold_addsub2_i32
      tcg/optimize: Split out fold_movcond
      tcg/optimize: Split out fold_extract2
      tcg/optimize: Split out fold_extract, fold_sextract
      tcg/optimize: Split out fold_deposit
      tcg/optimize: Split out fold_count_zeros
      tcg/optimize: Split out fold_bswap
      tcg/optimize: Split out fold_dup, fold_dup2
      tcg/optimize: Split out fold_mov
      tcg/optimize: Split out fold_xx_to_i
      tcg/optimize: Split out fold_xx_to_x
      tcg/optimize: Split out fold_xi_to_i
      tcg/optimize: Add type to OptContext
      tcg/optimize: Split out fold_to_not
      tcg/optimize: Split out fold_sub_to_neg
      tcg/optimize: Split out fold_xi_to_x
      tcg/optimize: Split out fold_ix_to_i
      tcg/optimize: Split out fold_masks
      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
      tcg/optimize: Sink commutative operand swapping into fold functions
      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
      tcg/optimize: Use fold_xx_to_i for orc
      tcg/optimize: Use fold_xi_to_x for mul
      tcg/optimize: Use fold_xi_to_x for div
      tcg/optimize: Use fold_xx_to_i for rem
      tcg/optimize: Optimize sign extensions
      tcg/optimize: Propagate sign info for logical operations
      tcg/optimize: Propagate sign info for setcond
      tcg/optimize: Propagate sign info for bit counting
      tcg/optimize: Propagate sign info for shifting

From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>

Addition of not and xor on 128-bit integers.

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
[rth: Split out logical operations.]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/int128.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return a;
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return ~a;
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return a | b;
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return a ^ b;
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return int128_make128(a, (a < 0) ? -1 : 0);
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return int128_make128(~a.lo, ~a.hi);
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return int128_make128(a.lo | b.lo, a.hi | b.hi);
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     int64_t h;
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

In preparation for changing the divu128/divs128 implementations
to allow for quotients larger than 64 bits, move the div-by-zero
and overflow checks to the callers.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |  5 +++--
 include/qemu/host-utils.h | 34 ++++++++++++---------------------
 target/ppc/int_helper.c   | 14 +++++++++-----
 util/host-utils.c         | 40 ++++++++++++++++++---------------------
 4 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
         return 0;
     }
     /*
-     * Ignore divu128() return value as we've caught div-by-zero and don't
-     * need different behaviour for overflow.
+     * BUG: when CONFIG_INT128 is not defined, the current implementation of
+     * divu128 does not return a valid truncated quotient, so the result will
+     * be wrong.
      */
     divu128(&lo, &hi, clk->period);
     return lo;
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
-        __uint128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result > UINT64_MAX;
-    }
+    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
+    __uint128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 
-static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
-        __int128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result != *plow;
-    }
+    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
     uint64_t rt = 0;
     int overflow = 0;
 
-    overflow = divu128(&rt, &ra, rb);
-
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || ra >= rb)) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divu128(&rt, &ra, rb);
     }
 
     if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
     int64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
-    int overflow = divs128(&rt, &ra, rb);
+    int overflow = 0;
 
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divs128(&rt, &ra, rb);
     }
 
     if (oe) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
     *phigh = rh;
 }
 
-/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
-/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
-/* remainder via phigh. */
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+/*
+ * Unsigned 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
     unsigned i;
     uint64_t carry = 0;
 
-    if (divisor == 0) {
-        return 1;
-    } else if (dhi == 0) {
+    if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
         *phigh = dlo % divisor;
-        return 0;
-    } else if (dhi >= divisor) {
-        return 1;
     } else {
 
         for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 
         *plow = dlo;
         *phigh = dhi;
-        return 0;
     }
 }
 
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+/*
+ * Signed 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
     int sgn_dvdnd = *phigh < 0;
     int sgn_divsr = divisor < 0;
-    int overflow = 0;
 
     if (sgn_dvdnd) {
         *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
         divisor = 0 - divisor;
     }
 
-    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
+    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 
     if (sgn_dvdnd  ^ sgn_divsr) {
         *plow = 0 - *plow;
     }
-
-    if (!overflow) {
-        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
-            overflow = 1;
-        }
-    }
-
-    return overflow;
 }
 #endif
 
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
so it can be reused by divu128().

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-macros.h | 82 ----------------------------------
 include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
  * so some portions are provided under:
  *  the SoftFloat-2a license
  *  the BSD license
- *  GPL-v2-or-later
  *
  * Any future contributions to this file after December 1st 2014 will be
  * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* Portions of this work are licensed under the terms of the GNU GPL,
- * version 2 or later. See the COPYING file in the top-level directory.
- */
-
 #ifndef FPU_SOFTFLOAT_MACROS_H
 #define FPU_SOFTFLOAT_MACROS_H
 
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 
 }
 
-/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
- * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
- *
- * Licensed under the GPLv2/LGPLv3
- */
-static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
-                                  uint64_t n0, uint64_t d)
-{
-#if defined(__x86_64__)
-    uint64_t q;
-    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
-    return q;
-#elif defined(__s390x__) && !defined(__clang__)
-    /* Need to use a TImode type to get an even register pair for DLGR.  */
-    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
-    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
-    *r = n >> 64;
-    return n;
-#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
-    /* From Power ISA 2.06, programming note for divdeu.  */
-    uint64_t q1, q2, Q, r1, r2, R;
-    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
-        : "=&r"(q1), "=r"(q2)
-        : "r"(n1), "r"(n0), "r"(d));
-    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
-    r2 = n0 - (q2 * d);
-    Q = q1 + q2;
-    R = r1 + r2;
-    if (R >= d || R < r2) { /* overflow implies R > d */
-        Q += 1;
-        R -= d;
-    }
-    *r = R;
-    return Q;
-#else
-    uint64_t d0, d1, q0, q1, r1, r0, m;
-
-    d0 = (uint32_t)d;
-    d1 = d >> 32;
-
-    r1 = n1 % d1;
-    q1 = n1 / d1;
-    m = q1 * d0;
-    r1 = (r1 << 32) | (n0 >> 32);
-    if (r1 < m) {
-        q1 -= 1;
-        r1 += d;
-        if (r1 >= d) {
-            if (r1 < m) {
-                q1 -= 1;
-                r1 += d;
-            }
-        }
-    }
-    r1 -= m;
-
-    r0 = r1 % d1;
-    q0 = r1 / d1;
-    m = q0 * d0;
-    r0 = (r0 << 32) | (uint32_t)n0;
-    if (r0 < m) {
-        q0 -= 1;
-        r0 += d;
-        if (r0 >= d) {
-            if (r0 < m) {
-                q0 -= 1;
-                r0 += d;
-            }
-        }
-    }
-    r0 -= m;
-
-    *r = r0;
-    return (q1 << 32) | q0;
-#endif
-}
-
 /*----------------------------------------------------------------------------
 | Returns an approximation to the square root of the 32-bit significand given
 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
+
 #ifndef HOST_UTILS_H
 #define HOST_UTILS_H
 
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
  */
 void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 
+/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
+                                  uint64_t n0, uint64_t d)
+{
+#if defined(__x86_64__)
+    uint64_t q;
+    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
+    return q;
+#elif defined(__s390x__) && !defined(__clang__)
+    /* Need to use a TImode type to get an even register pair for DLGR.  */
+    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
+    *r = n >> 64;
+    return n;
+#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
+    /* From Power ISA 2.06, programming note for divdeu.  */
+    uint64_t q1, q2, Q, r1, r2, R;
+    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
+        : "=&r"(q1), "=r"(q2)
+        : "r"(n1), "r"(n0), "r"(d));
+    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
+    r2 = n0 - (q2 * d);
+    Q = q1 + q2;
+    R = r1 + r2;
+    if (R >= d || R < r2) { /* overflow implies R > d */
+        Q += 1;
+        R -= d;
+    }
+    *r = R;
+    return Q;
+#else
+    uint64_t d0, d1, q0, q1, r1, r0, m;
+
+    d0 = (uint32_t)d;
+    d1 = d >> 32;
+
+    r1 = n1 % d1;
+    q1 = n1 / d1;
+    m = q1 * d0;
+    r1 = (r1 << 32) | (n0 >> 32);
+    if (r1 < m) {
+        q1 -= 1;
+        r1 += d;
+        if (r1 >= d) {
+            if (r1 < m) {
+                q1 -= 1;
+                r1 += d;
+            }
+        }
+    }
+    r1 -= m;
+
+    r0 = r1 % d1;
+    q0 = r1 / d1;
+    m = q0 * d0;
+    r0 = (r0 << 32) | (uint32_t)n0;
+    if (r0 < m) {
+        q0 -= 1;
+        r0 += d;
+        if (r0 >= d) {
+            if (r0 < m) {
+                q0 -= 1;
+                r0 += d;
+            }
+        }
+    }
+    r0 -= m;
+
+    *r = r0;
+    return (q1 << 32) | q0;
+#endif
+}
+
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Add 128-bit quotient support to divu128/divs128. This will be used
to implement new decimal floating point instructions from Power ISA 3.1.

The remainder is now returned directly by divu128/divs128,
freeing up phigh to receive the high 64 bits of the quotient.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |   6 +-
 include/qemu/host-utils.h |  20 ++++--
 target/ppc/int_helper.c   |   9 +--
 util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 4 files changed, 108 insertions(+), 60 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
     if (clk->period == 0) {
         return 0;
     }
-    /*
-     * BUG: when CONFIG_INT128 is not defined, the current implementation of
-     * divu128 does not return a valid truncated quotient, so the result will
-     * be wrong.
-     */
+
     divu128(&lo, &hi, clk->period);
     return lo;
 }
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
+                               uint64_t divisor)
 {
     __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
     __uint128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 
-static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
+                              int64_t divisor)
 {
-    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
     __int128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
 
 uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
 {
-    int64_t rt = 0;
+    uint64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
     int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
     int cr;
     uint64_t lo_value;
     uint64_t hi_value;
+    uint64_t rem;
     ppc_avr_t ret = { .u64 = { 0, 0 } };
 
     if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
          * In that case, we leave r unchanged.
          */
     } else {
-        divu128(&lo_value, &hi_value, 1000000000000000ULL);
+        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 
-        for (i = 1; i < 16; hi_value /= 10, i++) {
-            bcd_put_digit(&ret, hi_value % 10, i);
+        for (i = 1; i < 16; rem /= 10, i++) {
+            bcd_put_digit(&ret, rem % 10, i);
         }
 
         for (; i < 32; lo_value /= 10, i++) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
 }
 
 /*
- * Unsigned 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Unsigned 128-by-64 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
+ * This function will cause a division by zero if passed a zero divisor.
  */
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
-    unsigned i;
-    uint64_t carry = 0;
+    uint64_t rem, dhighest;
+    int sh;
 
     if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
-        *phigh = dlo % divisor;
+        *phigh = 0;
+        return dlo % divisor;
     } else {
+        sh = clz64(divisor);
 
-        for (i = 0; i < 64; i++) {
-            carry = dhi >> 63;
-            dhi = (dhi << 1) | (dlo >> 63);
-            if (carry || (dhi >= divisor)) {
-                dhi -= divisor;
-                carry = 1;
-            } else {
-                carry = 0;
+        if (dhi < divisor) {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
             }
-            dlo = (dlo << 1) | carry;
+
+            *phigh = 0;
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+        } else {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhighest = dhi >> (64 - sh);
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
+
+                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+            } else {
+                /**
+                 * dhi >= divisor
+                 * Since the MSB of divisor is set (sh == 0),
+                 * (dhi - divisor) < divisor
+                 *
+                 * Thus, the high part of the quotient is 1, and we can
+                 * calculate the low part with a single call to udiv_qrnnd
+                 * after subtracting divisor from dhi
+                 */
+                dhi -= divisor;
+                *phigh = 1;
+            }
+
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
         }
 
-        *plow = dlo;
-        *phigh = dhi;
+        /*
+         * since the dividend/divisor might have been normalized,
+         * the remainder might also have to be shifted back
+         */
+        return rem >> sh;
     }
 }
 
 /*
- * Signed 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Signed 128-by-64 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    int sgn_dvdnd = *phigh < 0;
-    int sgn_divsr = divisor < 0;
+    bool neg_quotient = false, neg_remainder = false;
+    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+    uint64_t rem;
 
-    if (sgn_dvdnd) {
-        *plow = ~(*plow);
-        *phigh = ~(*phigh);
-        if (*plow == (int64_t)-1) {
+    if (*phigh < 0) {
+        neg_quotient = !neg_quotient;
+        neg_remainder = !neg_remainder;
+
+        if (unsig_lo == 0) {
+            unsig_hi = -unsig_hi;
+        } else {
+            unsig_hi = ~unsig_hi;
+            unsig_lo = -unsig_lo;
+        }
+    }
+
+    if (divisor < 0) {
+        neg_quotient = !neg_quotient;
+
+        divisor = -divisor;
+    }
+
+    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+    if (neg_quotient) {
+        if (unsig_lo == 0) {
+            *phigh = -unsig_hi;
             *plow = 0;
-            (*phigh)++;
-         } else {
-            (*plow)++;
-         }
+        } else {
+            *phigh = ~unsig_hi;
+            *plow = -unsig_lo;
+        }
+    } else {
+        *phigh = unsig_hi;
+        *plow = unsig_lo;
     }
 
-    if (sgn_divsr) {
-        divisor = 0 - divisor;
-    }
-
-    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
-
-    if (sgn_dvdnd  ^ sgn_divsr) {
-        *plow = 0 - *plow;
+    if (neg_remainder) {
+        return -rem;
+    } else {
+        return rem;
     }
 }
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

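Add unit tests for divu128() and divs128(), checking both the 128-bit
quotient and the returned remainder.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>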
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
 tests/unit/meson.build   |   1 +
 2 files changed, 198 insertions(+)
 create mode 100644 tests/unit/test-div128.c

diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Test 128-bit division functions
+ *
+ * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+typedef struct {
+    uint64_t high;
+    uint64_t low;
+    uint64_t rhigh;
+    uint64_t rlow;
+    uint64_t divisor;
+    uint64_t remainder;
+} test_data_unsigned;
+
+typedef struct {
+    int64_t high;
+    uint64_t low;
+    int64_t rhigh;
+    uint64_t rlow;
+    int64_t divisor;
+    int64_t remainder;
+} test_data_signed;
+
+static const test_data_unsigned test_table_unsigned[] = {
+    /* Dividend fits in 64 bits */
+    { 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000003ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000002ULL, 0x0000000000000001ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0xa000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000002ULL,
+      0x4000000000000000ULL, 0x2000000000000000ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x8000000000000000ULL, 0x0000000000000000ULL},
+
+    /* Dividend > 64 bits, with MSB 0 */
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x000000000000000dULL,
+      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
+      0x0000000000000010ULL, 0x0000000000000001ULL},
+
+    /* Dividend > 64 bits, with MSB 1 */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
+      0x0000000000000010ULL, 0x000000000000000fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
+      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
+
+    /**
+     * Divisor == 64 bits, with MSB 1
+     * and high 64 bits of dividend >= divisor
+     * (for testing normalization)
+     */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0xfddbb9977553310aULL,
+      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
+
+    /* Dividend > 64 bits, divisor almost as big */
+    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
+      0x0000000000000000ULL, 0x000000000000000fULL,
+      0x123456789abcdefeULL, 0x123456789abcde1fULL},
+};
+
+static const test_data_signed test_table_signed[] = {
+    /* Positive dividend, positive/negative divisors */
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0x0000000000000008LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0xfffffffffffffff8LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0x0000000000000237LL, 0x0000000000000183LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0xfffffffffffffdc9LL, 0x0000000000000183LL},
+
+    /* Negative dividend, positive/negative divisors */
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0x0000000000000008LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0x0000000000000237LL, 0xfffffffffffffe7dLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
+};
+
+static void test_divu128(void)
+{
+    int i;
+    uint64_t rem;
+    test_data_unsigned tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
+        tmp = test_table_unsigned[i];
+
+        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpuint(rem, ==, tmp.remainder);
+    }
+}
+
+static void test_divs128(void)
+{
+    int i;
+    int64_t rem;
+    test_data_signed tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
+        tmp = test_table_signed[i];
+
+        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpuint(rem, ==, tmp.remainder);
+    }
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/host-utils/test_divu128", test_divu128);
+    g_test_add_func("/host-utils/test_divs128", test_divs128);
+    return g_test_run();
+}
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
   # all code tested by test-x86-cpuid is inside topology.h
   'test-x86-cpuid': [],
   'test-cutils': [],
+  'test-div128': [],
   'test-shift128': [],
   'test-mul64': [],
   # all code tested by test-int128 is inside int128.h
-- 
2.25.1

Prepare for tracking different masks by renaming the existing "mask"
field and local variables to "z_mask".

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
 1 file changed, 72 insertions(+), 70 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *prev_copy;
     TCGTemp *next_copy;
     uint64_t val;
-    uint64_t mask;
+    uint64_t z_mask;  /* a 0 mask bit means the value bit is known to be 0 */
 } TempOptInfo;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->next_copy = ts;
     ti->prev_copy = ts;
     ti->is_const = false;
-    ti->mask = -1;
+    ti->z_mask = -1;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     if (ts->kind == TEMP_CONST) {
         ti->is_const = true;
         ti->val = ts->val;
-        ti->mask = ts->val;
+        ti->z_mask = ts->val;
         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
             /* High bits of a 32-bit quantity are garbage.  */
-            ti->mask |= ~0xffffffffull;
+            ti->z_mask |= ~0xffffffffull;
         }
     } else {
         ti->is_const = false;
-        ti->mask = -1;
+        ti->z_mask = -1;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t mask;
+    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    mask = si->mask;
+    z_mask = si->z_mask;
     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
         /* High bits of the destination are now garbage.  */
-        mask |= ~0xffffffffull;
+        z_mask |= ~0xffffffffull;
     }
-    di->mask = mask;
+    di->z_mask = z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t mask, partmask, affected, tmp;
+        uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
-        mask = -1;
+        z_mask = -1;
         affected = -1;
         switch (opc) {
         CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext8u):
-            mask = 0xff;
+            z_mask = 0xff;
             goto and_const;
         CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             goto and_const;
         case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_ext32u_i64:
-            mask = 0xffffffffU;
+            z_mask = 0xffffffffU;
             goto and_const;
 
         CASE_OP_32_64(and):
-            mask = arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[2])->z_mask;
             if (arg_is_const(op->args[2])) {
         and_const:
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
-            mask = arg_info(op->args[1])->mask & mask;
+            z_mask = arg_info(op->args[1])->z_mask & z_mask;
             break;
 
         case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_extu_i32_i64:
             /* We do not compute affected as it is a size changing op.  */
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
 
         CASE_OP_32_64(andc):
             /* Known-zeros does not imply known-ones.  Therefore unless
                op->args[2] is constant, we can't infer anything from it.  */
             if (arg_is_const(op->args[2])) {
-                mask = ~arg_info(op->args[2])->mask;
+                z_mask = ~arg_info(op->args[2])->z_mask;
                 goto and_const;
             }
             /* But we certainly know nothing outside args[1] may be set. */
-            mask = arg_info(op->args[1])->mask;
+            z_mask = arg_info(op->args[1])->z_mask;
             break;
 
         case INDEX_op_sar_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_sar_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_shr_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_shr_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_extrl_i64_i32:
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
         case INDEX_op_extrh_i64_i32:
-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
+            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
             break;
 
         CASE_OP_32_64(shl):
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                mask = arg_info(op->args[1])->mask << tmp;
+                z_mask = arg_info(op->args[1])->z_mask << tmp;
             }
             break;
 
         CASE_OP_32_64(neg):
             /* Set to 1 all bits to the left of the rightmost.  */
-            mask = -(arg_info(op->args[1])->mask
-                     & -arg_info(op->args[1])->mask);
+            z_mask = -(arg_info(op->args[1])->z_mask
+                       & -arg_info(op->args[1])->z_mask);
             break;
 
         CASE_OP_32_64(deposit):
-            mask = deposit64(arg_info(op->args[1])->mask,
-                             op->args[3], op->args[4],
-                             arg_info(op->args[2])->mask);
+            z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                               op->args[3], op->args[4],
+                               arg_info(op->args[2])->z_mask);
             break;
 
         CASE_OP_32_64(extract):
-            mask = extract64(arg_info(op->args[1])->mask,
-                             op->args[2], op->args[3]);
+            z_mask = extract64(arg_info(op->args[1])->z_mask,
+                               op->args[2], op->args[3]);
             if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
         CASE_OP_32_64(sextract):
-            mask = sextract64(arg_info(op->args[1])->mask,
-                              op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+            z_mask = sextract64(arg_info(op->args[1])->z_mask,
+                                op->args[2], op->args[3]);
+            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
 
         CASE_OP_32_64(or):
         CASE_OP_32_64(xor):
-            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[1])->z_mask
+                   | arg_info(op->args[2])->z_mask;
             break;
 
         case INDEX_op_clz_i32:
         case INDEX_op_ctz_i32:
-            mask = arg_info(op->args[2])->mask | 31;
+            z_mask = arg_info(op->args[2])->z_mask | 31;
             break;
 
         case INDEX_op_clz_i64:
         case INDEX_op_ctz_i64:
-            mask = arg_info(op->args[2])->mask | 63;
+            z_mask = arg_info(op->args[2])->z_mask | 63;
             break;
 
         case INDEX_op_ctpop_i32:
-            mask = 32 | 31;
+            z_mask = 32 | 31;
             break;
         case INDEX_op_ctpop_i64:
-            mask = 64 | 63;
+            z_mask = 64 | 63;
             break;
 
         CASE_OP_32_64(setcond):
         case INDEX_op_setcond2_i32:
-            mask = 1;
+            z_mask = 1;
             break;
 
         CASE_OP_32_64(movcond):
-            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
+            z_mask = arg_info(op->args[3])->z_mask
+                   | arg_info(op->args[4])->z_mask;
             break;
 
         CASE_OP_32_64(ld8u):
-            mask = 0xff;
+            z_mask = 0xff;
             break;
         CASE_OP_32_64(ld16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             break;
         case INDEX_op_ld32u_i64:
-            mask = 0xffffffffu;
+            z_mask = 0xffffffffu;
             break;
 
         CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
-                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                 }
             }
             break;
 
         CASE_OP_32_64(bswap16):
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffff) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffff) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap16(mask);
+            z_mask = bswap16(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int16_t)mask;
+                z_mask = (int16_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(16, 48);
+                z_mask |= MAKE_64BIT_MASK(16, 48);
                 break;
             }
             break;
 
         case INDEX_op_bswap32_i64:
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffffffffu) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffffffffu) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap32(mask);
+            z_mask = bswap32(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int32_t)mask;
+                z_mask = (int32_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(32, 32);
+                z_mask |= MAKE_64BIT_MASK(32, 32);
                 break;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         /* 32-bit ops generate 32-bit results.  For the result is zero test
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
-        partmask = mask;
+        partmask = z_mask;
         if (!(def->flags & TCG_OPF_64BIT)) {
-            mask |= ~(tcg_target_ulong)0xffffffffu;
+            z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                    vs the high word of the input.  */
             do_setcond_high:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             /* Default case: we know nothing about operation (or were unable
                to compute the operation result) so no propagation is done.
                We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "mask" is
+               block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
                 memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Save the corresponding known-zero bits mask for the
                        first output argument (only one supported so far). */
                     if (i == 0) {
-                        arg_info(op->args[i])->mask = mask;
+                        arg_info(op->args[i])->z_mask = z_mask;
                     }
                 }
             }
-- 
2.25.1

Provide what will become a larger context for splitting
the very large tcg_optimize function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 37 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
+typedef struct OptContext {
+    TCGTempSet temps_used;
+} OptContext;
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+static void init_ts_info(OptContext *ctx, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
     TempOptInfo *ti;
 
-    if (test_bit(idx, temps_used->l)) {
+    if (test_bit(idx, ctx->temps_used.l)) {
         return;
     }
-    set_bit(idx, temps_used->l);
+    set_bit(idx, ctx->temps_used.l);
 
     ti = ts->state_ptr;
     if (ti == NULL) {
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
+static void init_arg_info(OptContext *ctx, TCGArg arg)
 {
-    init_ts_info(temps_used, arg_temp(arg));
+    init_ts_info(ctx, arg_temp(arg));
 }
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
                              TCGOp *op, TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
 
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
-    init_ts_info(temps_used, tv);
+    init_ts_info(ctx, tv);
     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    TCGTempSet temps_used;
+    OptContext ctx = {};
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
 
-    memset(&temps_used, 0, sizeof(temps_used));
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
                 TCGTemp *ts = arg_temp(op->args[i]);
                 if (ts) {
-                    init_ts_info(&temps_used, ts);
+                    init_ts_info(&ctx, ts);
                 }
             }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&temps_used, op->args[i]);
+                init_arg_info(&ctx, op->args[i]);
             }
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                            op->args[1], op->args[2]);
             if (tmp != 2) {
                 if (tmp) {
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[3];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
                 if (tmp) {
             do_brcond_true:
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[5];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     goto do_default;
                 }
             do_brcond_low:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
             } else if ((op->args[5] == TCG_COND_LT
                         || op->args[5] == TCG_COND_GE)
                        && arg_is_const(op->args[3])
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, temps_used.l)) {
+                    if (test_bit(i, ctx.temps_used.l)) {
                         reset_ts(&s->temps[i]);
                     }
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
             } else {
         do_reset_output:
                 for (i = 0; i < nb_oargs; i++) {
-- 
2.25.1

Break the final cleanup clause out of the main switch
statement.  When fully folding an opcode to mov/movi,
use "continue" to process the next opcode, else break
to fall into the final cleanup.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
 1 file changed, 94 insertions(+), 96 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-            break;
+            continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
-                break;
+                continue;
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
                 nb_iargs = 1;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(not):
         CASE_OP_32_64(neg):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(deposit):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract):
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(sextract):
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract2):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                     ((uint32_t)v2 << (32 - shr)));
                 }
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(setcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(brcond):
             tmp = do_constant_folding_cond(opc, op->args[0],
                                            op->args[1], op->args[2]);
-            if (tmp != 2) {
-                if (tmp) {
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[3];
-                } else {
-                    tcg_op_remove(s, op);
-                }
+            switch (tmp) {
+            case 0:
+                tcg_op_remove(s, op);
+                continue;
+            case 1:
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[3];
                 break;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(movcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
-                break;
+                continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
                 uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (fv == 1 && tv == 0) {
                     cond = tcg_invert_cond(cond);
                 } else if (!(tv == 1 && fv == 0)) {
-                    goto do_default;
+                    break;
                 }
                 op->args[3] = cond;
                 op->opc = opc = (opc == INDEX_op_movcond_i32
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                  : INDEX_op_setcond_i64);
                 nb_iargs = 2;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_add2_i32:
         case INDEX_op_sub2_i32:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_mulu2_i32:
             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_brcond2_i32:
             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
                                             op->args[4]);
-            if (tmp != 2) {
-                if (tmp) {
-            do_brcond_true:
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[5];
-                } else {
+            if (tmp == 0) {
             do_brcond_false:
-                    tcg_op_remove(s, op);
-                }
-            } else if ((op->args[4] == TCG_COND_LT
-                        || op->args[4] == TCG_COND_GE)
-                       && arg_is_const(op->args[2])
-                       && arg_info(op->args[2])->val == 0
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0) {
+                tcg_op_remove(s, op);
+                continue;
+            }
+            if (tmp == 1) {
+            do_brcond_true:
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[5];
+                break;
+            }
+            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+                 && arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
+                op->opc = opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_brcond_false;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_brcond_low:
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_brcond_true;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
             do_setcond_const:
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-            } else if ((op->args[5] == TCG_COND_LT
-                        || op->args[5] == TCG_COND_GE)
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0
-                       && arg_is_const(op->args[4])
-                       && arg_info(op->args[4])->val == 0) {
+                continue;
+            }
+            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0
+                 && arg_is_const(op->args[4])
+                 && arg_info(op->args[4])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_setcond_high:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_setcond_high;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_setcond_const;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
-        case INDEX_op_call:
-            if (!(tcg_call_flags(op)
+        default:
+            break;
+        }
+
+        /* Some of the folding above can change opc. */
+        opc = op->opc;
+        def = &tcg_op_defs[opc];
+        if (def->flags & TCG_OPF_BB_END) {
+            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+        } else {
+            if (opc == INDEX_op_call &&
+                !(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
                     if (test_bit(i, ctx.temps_used.l)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     }
                 }
             }
-            goto do_reset_output;
 
-        default:
-        do_default:
-            /* Default case: we know nothing about operation (or were unable
-               to compute the operation result) so no propagation is done.
-               We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "z_mask" is
-               the non-zero bits mask for the first output arg.  */
-            if (def->flags & TCG_OPF_BB_END) {
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-            } else {
-        do_reset_output:
-                for (i = 0; i < nb_oargs; i++) {
-                    reset_temp(op->args[i]);
-                    /* Save the corresponding known-zero bits mask for the
-                       first output argument (only one supported so far). */
-                    if (i == 0) {
-                        arg_info(op->args[i])->z_mask = z_mask;
-                    }
+            for (i = 0; i < nb_oargs; i++) {
+                reset_temp(op->args[i]);
+                /* Save the corresponding known-zero bits mask for the
+                   first output argument (only one supported so far). */
+                if (i == 0) {
+                    arg_info(op->args[i])->z_mask = z_mask;
                 }
             }
-            break;
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-- 
2.25.1

Adjust the interface of tcg_opt_gen_mov and tcg_opt_gen_movi to take
the OptContext parameter instead of TCGContext or both.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 } TempOptInfo;
 
 typedef struct OptContext {
+    TCGContext *tcg;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
-        tcg_op_remove(s, op);
+        tcg_op_remove(ctx->tcg, op);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
-                             TCGOp *op, TCGArg dst, uint64_t val)
+static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+                             TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    OptContext ctx = {};
+    OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(or):
         CASE_OP_32_64_VEC(and):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 } else {
-                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                 }
                 continue;
             }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-- 
2.25.1

Move prev_mb into OptContext.  This will expose the variable to
subroutines that will be broken out of tcg_optimize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 
 typedef struct OptContext {
     TCGContext *tcg;
+    TCGOp *prev_mb;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
-    TCGOp *op, *op_next, *prev_mb = NULL;
+    TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-        if (prev_mb) {
+        if (ctx.prev_mb) {
             switch (opc) {
             case INDEX_op_mb:
                 /* Merge two barriers of the same type into one,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  * barrier.  This is stricter than specified but for
                  * the purposes of TCG is better than not optimizing.
                  */
-                prev_mb->args[0] |= op->args[0];
+                ctx.prev_mb->args[0] |= op->args[0];
                 tcg_op_remove(s, op);
                 break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i64:
             case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
-                prev_mb = NULL;
+                ctx.prev_mb = NULL;
                 break;
             }
         } else if (opc == INDEX_op_mb) {
-            prev_mb = op;
+            ctx.prev_mb = op;
         }
     }
 }
-- 
2.25.1

There was no real reason for calls to have separate code here.
Unify init for calls vs non-calls using the call path, which
handles TCG_CALL_DUMMY_ARG.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(OptContext *ctx, TCGArg arg)
-{
-    init_ts_info(ctx, arg_temp(arg));
-}
-
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 {
     TCGTemp *i, *g, *l;
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
     return false;
 }
 
+static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
+{
+    for (int i = 0; i < nb_args; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts) {
+            init_ts_info(ctx, ts);
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (opc == INDEX_op_call) {
             nb_oargs = TCGOP_CALLO(op);
             nb_iargs = TCGOP_CALLI(op);
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                TCGTemp *ts = arg_temp(op->args[i]);
-                if (ts) {
-                    init_ts_info(&ctx, ts);
-                }
-            }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&ctx, op->args[i]);
-            }
         }
+        init_arguments(&ctx, op, nb_oargs + nb_iargs);
 
         /* Do copy propagation */
         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-- 
2.25.1

Continue splitting tcg_optimize: move the copy propagation loop
into a new copy_propagate helper.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
     }
 }
 
+static void copy_propagate(OptContext *ctx, TCGOp *op,
+                           int nb_oargs, int nb_iargs)
+{
+    TCGContext *s = ctx->tcg;
+
+    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts && ts_is_copy(ts)) {
+            op->args[i] = temp_arg(find_better_copy(s, ts));
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             nb_iargs = def->nb_iargs;
         }
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
-
-        /* Do copy propagation */
-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-            TCGTemp *ts = arg_temp(op->args[i]);
-            if (ts && ts_is_copy(ts)) {
-                op->args[i] = temp_arg(find_better_copy(s, ts));
-            }
-        }
+        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
-- 
2.25.1

Calls are special in that they have a variable number
of arguments, and need to be able to clobber globals.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static bool fold_call(OptContext *ctx, TCGOp *op)
+{
+    TCGContext *s = ctx->tcg;
+    int nb_oargs = TCGOP_CALLO(op);
+    int nb_iargs = TCGOP_CALLI(op);
+    int flags, i;
+
+    init_arguments(ctx, op, nb_oargs + nb_iargs);
+    copy_propagate(ctx, op, nb_oargs, nb_iargs);
+
+    /* If the function reads or writes globals, reset temp data. */
+    flags = tcg_call_flags(op);
+    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+        int nb_globals = s->nb_globals;
+
+        for (i = 0; i < nb_globals; i++) {
+            if (test_bit(i, ctx->temps_used.l)) {
+                reset_ts(&ctx->tcg->temps[i]);
+            }
+        }
+    }
+
+    /* Reset temp data for outputs. */
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+    }
+
+    /* Stop optimizing MB across calls. */
+    ctx->prev_mb = NULL;
+    return true;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
-    int nb_temps, nb_globals, i;
+    int nb_temps, i;
     TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
        available through the doubly linked circular list. */
 
     nb_temps = s->nb_temps;
-    nb_globals = s->nb_globals;
-
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
-        const TCGOpDef *def = &tcg_op_defs[opc];
+        const TCGOpDef *def;
 
-        /* Count the arguments, and initialize the temps that are
-           going to be used */
+        /* Calls are special. */
         if (opc == INDEX_op_call) {
-            nb_oargs = TCGOP_CALLO(op);
-            nb_iargs = TCGOP_CALLI(op);
-        } else {
-            nb_oargs = def->nb_oargs;
-            nb_iargs = def->nb_iargs;
+            fold_call(&ctx, op);
+            continue;
         }
+
+        def = &tcg_op_defs[opc];
+        nb_oargs = def->nb_oargs;
+        nb_iargs = def->nb_iargs;
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
-            if (opc == INDEX_op_call &&
-                !(tcg_call_flags(op)
-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
-                for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, ctx.temps_used.l)) {
-                        reset_ts(&s->temps[i]);
-                    }
-                }
-            }
-
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i32:
             case INDEX_op_qemu_st8_i32:
             case INDEX_op_qemu_st_i64:
-            case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
                 ctx.prev_mb = NULL;
                 break;
-- 
2.25.1

Rather than try to keep these up-to-date across folding,
re-read nb_oargs at the end, after re-reading the opcode.

A couple of asserts need dropping, but that will take care
of itself as we split the function further.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         uint64_t z_mask, partmask, affected, tmp;
-        int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         def = &tcg_op_defs[opc];
-        nb_oargs = def->nb_oargs;
-        nb_iargs = def->nb_iargs;
-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
+        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         CASE_OP_32_64(qemu_ld):
             {
-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
+                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         if (partmask == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
-                nb_iargs = 1;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = opc = (opc == INDEX_op_movcond_i32
                                  ? INDEX_op_setcond_i32
                                  : INDEX_op_setcond_i64);
-                nb_iargs = 2;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
+            int nb_oargs = def->nb_oargs;
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
-- 
2.25.1

Return -1 instead of 2 for failure, so that we can
use comparisons against 0 for all cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
 1 file changed, 74 insertions(+), 71 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
     }
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
-                                       TCGArg y, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+                                    TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
         case TCG_COND_GEU:
             return 1;
         default:
-            return 2;
+            return -1;
         }
     }
-    return 2;
+    return -1;
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
 {
     TCGArg al = p1[0], ah = p1[1];
     TCGArg bl = p2[0], bh = p2[1];
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
         return do_constant_folding_cond_eq(c);
     }
-    return 2;
+    return -1;
 }
 
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(setcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[3]);
-            if (tmp != 2) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[3]);
+            if (i >= 0) {
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             break;
 
         CASE_OP_32_64(brcond):
-            tmp = do_constant_folding_cond(opc, op->args[0],
-                                           op->args[1], op->args[2]);
-            switch (tmp) {
-            case 0:
+            i = do_constant_folding_cond(opc, op->args[0],
+                                         op->args[1], op->args[2]);
+            if (i == 0) {
                 tcg_op_remove(s, op);
                 continue;
-            case 1:
+            } else if (i > 0) {
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(movcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[5]);
-            if (tmp != 2) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[5]);
+            if (i >= 0) {
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         case INDEX_op_brcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                            op->args[4]);
-            if (tmp == 0) {
+            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                          op->args[4]);
+            if (i == 0) {
             do_brcond_false:
                 tcg_op_remove(s, op);
                 continue;
             }
-            if (tmp == 1) {
+            if (i > 0) {
             do_brcond_true:
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_brcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
             }
             break;
 
         case INDEX_op_setcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                            op->args[5]);
-            if (tmp != 2) {
+            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+                                          op->args[5]);
+            if (i >= 0) {
             do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_const;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
             }
-- 
2.25.1

This will allow callers to tail-call these functions
and return true, indicating that processing is complete.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

Copy z_mask into OptContext, for writeback to the
first output within the new function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGContext *tcg;
     TCGOp *prev_mb;
     TCGTempSet temps_used;
+
+    /* In flight values from optimization. */
+    uint64_t z_mask;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static void finish_folding(OptContext *ctx, TCGOp *op)
+{
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    int i, nb_oargs;
+
+    /*
+     * For an opcode that ends a BB, reset all temp data.
+     * We do no cross-BB optimization.
+     */
+    if (def->flags & TCG_OPF_BB_END) {
+        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+        ctx->prev_mb = NULL;
+        return;
+    }
+
+    nb_oargs = def->nb_oargs;
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+        /*
+         * Save the corresponding known-zero bits mask for the
+         * first output argument (only one supported so far).
+         */
+        if (i == 0) {
+            arg_info(op->args[i])->z_mask = ctx->z_mask;
+        }
+    }
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
+        ctx.z_mask = z_mask;
 
         if (partmask == 0) {
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Some of the folding above can change opc. */
-        opc = op->opc;
-        def = &tcg_op_defs[opc];
-        if (def->flags & TCG_OPF_BB_END) {
-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-        } else {
-            int nb_oargs = def->nb_oargs;
-            for (i = 0; i < nb_oargs; i++) {
-                reset_temp(op->args[i]);
-                /* Save the corresponding known-zero bits mask for the
-                   first output argument (only one supported so far). */
-                if (i == 0) {
-                    arg_info(op->args[i])->z_mask = z_mask;
-                }
-            }
-        }
+        finish_folding(&ctx, op);
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
+        bool done = false;
 
         /* Calls are special. */
         if (opc == INDEX_op_call) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
+            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+            break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        finish_folding(&ctx, op);
+        if (!done) {
+            finish_folding(&ctx, op);
+        }
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

This puts the separate mb optimization into the same framework
as the others.  While fold_qemu_{ld,st} are currently identical,
that won't last as more code gets moved.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 38 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mb(OptContext *ctx, TCGOp *op)
+{
+    /* Eliminate duplicate and redundant fence instructions.  */
+    if (ctx->prev_mb) {
+        /*
+         * Merge two barriers of the same type into one,
+         * or a weaker barrier into a stronger one,
+         * or two weaker barriers into a stronger one.
+         *   mb X; mb Y => mb X|Y
+         *   mb; strl => mb; st
+         *   ldaq; mb => ld; mb
+         *   ldaq; strl => ld; mb; st
+         * Other combinations are also merged into a strong
+         * barrier.  This is stricter than specified but for
+         * the purposes of TCG is better than not optimizing.
+         */
+        ctx->prev_mb->args[0] |= op->args[0];
+        tcg_op_remove(ctx->tcg, op);
+    } else {
+        ctx->prev_mb = op;
+    }
+    return true;
+}
+
+static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
+static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        case INDEX_op_mb:
+            done = fold_mb(&ctx, op);
+            break;
+        case INDEX_op_qemu_ld_i32:
+        case INDEX_op_qemu_ld_i64:
+            done = fold_qemu_ld(&ctx, op);
+            break;
+        case INDEX_op_qemu_st_i32:
+        case INDEX_op_qemu_st8_i32:
+        case INDEX_op_qemu_st_i64:
+            done = fold_qemu_st(&ctx, op);
+            break;
+
         default:
             break;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (!done) {
             finish_folding(&ctx, op);
         }
-
-        /* Eliminate duplicate and redundant fence instructions.  */
-        if (ctx.prev_mb) {
-            switch (opc) {
-            case INDEX_op_mb:
-                /* Merge two barriers of the same type into one,
-                 * or a weaker barrier into a stronger one,
-                 * or two weaker barriers into a stronger one.
-                 *   mb X; mb Y => mb X|Y
-                 *   mb; strl => mb; st
-                 *   ldaq; mb => ld; mb
-                 *   ldaq; strl => ld; mb; st
-                 * Other combinations are also merged into a strong
-                 * barrier.  This is stricter than specified but for
-                 * the purposes of TCG is better than not optimizing.
-                 */
-                ctx.prev_mb->args[0] |= op->args[0];
-                tcg_op_remove(s, op);
-                break;
-
-            default:
-                /* Opcodes that end the block stop the optimization.  */
-                if ((def->flags & TCG_OPF_BB_END) == 0) {
-                    break;
-                }
-                /* fallthru */
-            case INDEX_op_qemu_ld_i32:
-            case INDEX_op_qemu_ld_i64:
-            case INDEX_op_qemu_st_i32:
-            case INDEX_op_qemu_st8_i32:
-            case INDEX_op_qemu_st_i64:
-                /* Opcodes that touch guest memory stop the optimization.  */
-                ctx.prev_mb = NULL;
-                break;
-            }
-        } else if (opc == INDEX_op_mb) {
-            ctx.prev_mb = op;
-        }
     }
 }
-- 
2.25.1

Split out a whole bunch of placeholder functions, which are
currently identical.  That won't last as more code gets moved.

Use CASE_OP_32_64_VEC for some logical operators that previously
missed the addition of vectors.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 219 insertions(+), 52 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
     }
 }
 
+/*
+ * The fold_* functions return true when processing is complete,
+ * usually by folding the operation to a constant or to a copy,
+ * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
+ * like collect information about the value produced, for use in
+ * optimizing a subsequent operation.
+ *
+ * These first fold_* functions are all helpers, used by other
+ * folders for more specific operations.
+ */
+
+static bool fold_const1(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = do_constant_folding(op->opc, t, 0);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_const2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = do_constant_folding(op->opc, t1, t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
+/*
+ * These outermost fold_<op> functions are sorted alphabetically.
+ */
+
+static bool fold_add(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_and(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_andc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_divide(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_eqv(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_exts(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_extu(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
 static bool fold_mb(OptContext *ctx, TCGOp *op)
 {
     /* Eliminate duplicate and redundant fence instructions.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mul(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_nand(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_neg(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_nor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_not(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_or(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_orc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_remainder(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_shift(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_sub(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_xor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(not):
-        CASE_OP_32_64(neg):
-        CASE_OP_32_64(ext8s):
-        CASE_OP_32_64(ext8u):
-        CASE_OP_32_64(ext16s):
-        CASE_OP_32_64(ext16u):
-        CASE_OP_32_64(ctpop):
-        case INDEX_op_ext32s_i64:
-        case INDEX_op_ext32u_i64:
-        case INDEX_op_ext_i32_i64:
-        case INDEX_op_extu_i32_i64:
-        case INDEX_op_extrl_i64_i32:
-        case INDEX_op_extrh_i64_i32:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
         case INDEX_op_bswap64_i64:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(add):
-        CASE_OP_32_64(sub):
-        CASE_OP_32_64(mul):
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(and):
-        CASE_OP_32_64(xor):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-        CASE_OP_32_64(andc):
-        CASE_OP_32_64(orc):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-        CASE_OP_32_64(div):
-        CASE_OP_32_64(divu):
-        CASE_OP_32_64(rem):
-        CASE_OP_32_64(remu):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        default:
+            break;
+
+        /* ---------------------------------------------------------- */
+        /* Sorted alphabetically by opcode as much as possible. */
+
+        CASE_OP_32_64_VEC(add):
+            done = fold_add(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(and):
+            done = fold_and(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(andc):
+            done = fold_andc(&ctx, op);
+            break;
+        CASE_OP_32_64(ctpop):
+            done = fold_ctpop(&ctx, op);
+            break;
+        CASE_OP_32_64(div):
+        CASE_OP_32_64(divu):
+            done = fold_divide(&ctx, op);
+            break;
+        CASE_OP_32_64(eqv):
+            done = fold_eqv(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8s):
+        CASE_OP_32_64(ext16s):
+        case INDEX_op_ext32s_i64:
+        case INDEX_op_ext_i32_i64:
+            done = fold_exts(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8u):
+        CASE_OP_32_64(ext16u):
+        case INDEX_op_ext32u_i64:
+        case INDEX_op_extu_i32_i64:
+        case INDEX_op_extrl_i64_i32:
+        case INDEX_op_extrh_i64_i32:
+            done = fold_extu(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(mul):
+            done = fold_mul(&ctx, op);
+            break;
+        CASE_OP_32_64(mulsh):
+        CASE_OP_32_64(muluh):
+            done = fold_mul_highpart(&ctx, op);
+            break;
+        CASE_OP_32_64(nand):
+            done = fold_nand(&ctx, op);
+            break;
+        CASE_OP_32_64(neg):
+            done = fold_neg(&ctx, op);
+            break;
+        CASE_OP_32_64(nor):
+            done = fold_nor(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(not):
+            done = fold_not(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(or):
+            done = fold_or(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(orc):
+            done = fold_orc(&ctx, op);
+            break;
         case INDEX_op_qemu_ld_i32:
         case INDEX_op_qemu_ld_i64:
             done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_qemu_st_i64:
             done = fold_qemu_st(&ctx, op);
             break;
-
-        default:
+        CASE_OP_32_64(rem):
+        CASE_OP_32_64(remu):
+            done = fold_remainder(&ctx, op);
+            break;
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+            done = fold_shift(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(sub):
+            done = fold_sub(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(xor):
+            done = fold_xor(&ctx, op);
             break;
         }
 
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
 1 file changed, 72 insertions(+), 73 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Simplify a setcond2_i32 (double-word compare producing 0/1).
+ *
+ * Operand layout as used below: args[0] is the output, args[1]/args[2]
+ * are the low/high words of the first input, args[3]/args[4] the
+ * low/high words of the second input, and args[5] the condition.
+ *
+ * Outcomes:
+ *   - comparison fully known: replaced by a constant move, and the
+ *     result of tcg_opt_gen_movi() is returned;
+ *   - comparison reducible to one word pair: op rewritten in place to
+ *     setcond_i32, returns false;
+ *   - otherwise: op left untouched, returns false.
+ */
+static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
+    /* Set for NE so that the EQ and NE cases can share one code path. */
+    int inv = 0;
+
+    /* A non-negative result means the whole comparison is already known. */
+    if (i >= 0) {
+        goto do_setcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
+            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
+            goto do_setcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        /*
+         * Low pair first.  With i in {0, 1} when known, (i ^ inv) == 0
+         * means this pair is known to differ, which decides the whole
+         * comparison (i itself is then the constant result: 0 for EQ,
+         * 1 for NE).  (i ^ inv) == 1 means the pair is known equal, so
+         * only the high pair matters.  An unknown (negative) i matches
+         * neither case.
+         */
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            goto do_setcond_high;
+        }
+
+        /* Same test on the high pair; if equal, compare the low pair only. */
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+                                     op->args[4], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            /* args[1] already holds the low word of the first input. */
+            op->args[2] = op->args[3];
+            op->args[3] = cond;
+            op->opc = INDEX_op_setcond_i32;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    /* Reached only via goto: rewrite as a compare of the two high words. */
+    do_setcond_high:
+        op->args[1] = op->args[2];
+        op->args[2] = op->args[4];
+        op->args[3] = cond;
+        op->opc = INDEX_op_setcond_i32;
+        break;
+    }
+    return false;
+
+    /* Reached only via goto, with i holding the known 0/1 result. */
+ do_setcond_const:
+    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_setcond2_i32:
-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                          op->args[5]);
-            if (i >= 0) {
-            do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
-                continue;
-            }
-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0
-                 && arg_is_const(op->args[4])
-                 && arg_info(op->args[4])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_setcond_high:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_const;
-                } else if (i > 0) {
-                    goto do_setcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i < 0) {
-                    break;
-                }
-            do_setcond_low:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[2] = op->args[3];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_low;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(shr):
             done = fold_shift(&ctx, op);
             break;
+        case INDEX_op_setcond2_i32:
+            done = fold_setcond2(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
 1 file changed, 81 insertions(+), 78 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Simplify a brcond2_i32 (conditional branch on a double-word compare).
+ *
+ * Operand layout as used below: args[0]/args[1] are the low/high words
+ * of the first input, args[2]/args[3] the low/high words of the second
+ * input, args[4] the condition and args[5] the branch label.
+ *
+ * Outcomes:
+ *   - comparison known false: the op is removed, returns true;
+ *   - comparison known true: op rewritten to an unconditional br,
+ *     returns false;
+ *   - comparison reducible to one word pair: op rewritten in place to
+ *     brcond_i32, returns false;
+ *   - otherwise: op left untouched, returns false.
+ */
+static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[4];
+    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
+    /* Saved up front because the rewrites below overwrite the arg slots. */
+    TCGArg label = op->args[5];
+    /* Set for NE so that the EQ and NE cases can share one code path. */
+    int inv = 0;
+
+    /* A non-negative result means the whole comparison is already known. */
+    if (i >= 0) {
+        goto do_brcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
+            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
+            goto do_brcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        /*
+         * Low pair first.  With i in {0, 1} when known, (i ^ inv) == 0
+         * means this pair is known to differ, which decides the whole
+         * comparison (i itself is then the outcome: 0 for EQ, 1 for
+         * NE).  (i ^ inv) == 1 means the pair is known equal, so only
+         * the high pair matters.  An unknown (negative) i matches
+         * neither case.
+         */
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+                                     op->args[2], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            goto do_brcond_high;
+        }
+
+        /* Same test on the high pair; if equal, branch on the low pair only. */
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            /* args[0] already holds the low word of the first input. */
+            op->opc = INDEX_op_brcond_i32;
+            op->args[1] = op->args[2];
+            op->args[2] = cond;
+            op->args[3] = label;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    /* Reached only via goto: rewrite as a branch on the two high words. */
+    do_brcond_high:
+        op->opc = INDEX_op_brcond_i32;
+        op->args[0] = op->args[1];
+        op->args[1] = op->args[3];
+        op->args[2] = cond;
+        op->args[3] = label;
+        break;
+
+    /* Reached only via goto, with i holding the known 0/1 outcome. */
+    do_brcond_const:
+        if (i == 0) {
+            /* Never taken: drop the branch entirely. */
+            tcg_op_remove(ctx->tcg, op);
+            return true;
+        }
+        /* Always taken: becomes an unconditional branch. */
+        op->opc = INDEX_op_br;
+        op->args[0] = label;
+        break;
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_brcond2_i32:
-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                          op->args[4]);
-            if (i == 0) {
-            do_brcond_false:
-                tcg_op_remove(s, op);
-                continue;
-            }
-            if (i > 0) {
-            do_brcond_true:
-                op->opc = opc = INDEX_op_br;
-                op->args[0] = op->args[5];
-                break;
-            }
-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
-                 && arg_is_const(op->args[2])
-                 && arg_info(op->args[2])->val == 0
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_brcond_high:
-                op->opc = opc = INDEX_op_brcond_i32;
-                op->args[0] = op->args[1];
-                op->args[1] = op->args[3];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i > 0) {
-                    goto do_brcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i < 0) {
-                    break;
-                }
-            do_brcond_low:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_high;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_low;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(andc):
             done = fold_andc(&ctx, op);
             break;
+        case INDEX_op_brcond2_i32:
+            done = fold_brcond2(&ctx, op);
+            break;
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Constant-fold mulu2_i32.  When both multiplicands (args[2], args[3])
+ * are known constants, compute the 64-bit unsigned product and replace
+ * the op with two constant moves: the low half into args[0] and the
+ * high half into args[1].  Returns true when folded, false otherwise.
+ */
+static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+{
+    TCGArg dst_lo, dst_hi;
+    TCGOp *hi_op;
+    uint32_t lhs, rhs;
+    uint64_t product;
+
+    if (!arg_is_const(op->args[2]) || !arg_is_const(op->args[3])) {
+        return false;
+    }
+
+    lhs = arg_info(op->args[2])->val;
+    rhs = arg_info(op->args[3])->val;
+    product = (uint64_t)lhs * rhs;
+
+    /* A second op is needed because one mulu2 has two outputs. */
+    hi_op = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+    /* Capture both destinations before the original op is rewritten. */
+    dst_lo = op->args[0];
+    dst_hi = op->args[1];
+
+    tcg_opt_gen_movi(ctx, op, dst_lo, (int32_t)product);
+    tcg_opt_gen_movi(ctx, hi_op, dst_hi, (int32_t)(product >> 32));
+    return true;
+}
+
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_mulu2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-                uint32_t a = arg_info(op->args[2])->val;
-                uint32_t b = arg_info(op->args[3])->val;
-                uint64_t r = (uint64_t)a * b;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
+        case INDEX_op_mulu2_i32:
+            done = fold_mulu2_i32(&ctx, op);
+            break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
             break;
-- 
2.25.1

Add two additional helpers, fold_add2_i32 and fold_sub2_i32
which will not be simple wrappers forever.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Constant-fold add2_i32 / sub2_i32.  When all four input words
+ * (args[2]..args[5]: low/high of each operand) are known constants,
+ * perform the 64-bit addition (add == true) or subtraction
+ * (add == false) and replace the op with two constant moves: the low
+ * half into args[0] and the high half into args[1].
+ * Returns true when folded, false otherwise.
+ */
+static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+{
+    uint64_t lhs, rhs, result;
+    TCGArg dst_lo, dst_hi;
+    TCGOp *hi_op;
+    int k;
+
+    for (k = 2; k <= 5; k++) {
+        if (!arg_is_const(op->args[k])) {
+            return false;
+        }
+    }
+
+    /* Reassemble each 64-bit operand from its two 32-bit halves. */
+    lhs = ((uint64_t)(uint32_t)arg_info(op->args[3])->val << 32)
+        | (uint32_t)arg_info(op->args[2])->val;
+    rhs = ((uint64_t)(uint32_t)arg_info(op->args[5])->val << 32)
+        | (uint32_t)arg_info(op->args[4])->val;
+
+    /* A second op is needed because one add2/sub2 has two outputs. */
+    hi_op = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+    result = add ? lhs + rhs : lhs - rhs;
+
+    /* Capture both destinations before the original op is rewritten. */
+    dst_lo = op->args[0];
+    dst_hi = op->args[1];
+
+    tcg_opt_gen_movi(ctx, op, dst_lo, (int32_t)result);
+    tcg_opt_gen_movi(ctx, hi_op, dst_hi, (int32_t)(result >> 32));
+    return true;
+}
+
+/*
+ * Fold hook for add2_i32: selects the addition variant of
+ * fold_addsub2_i32() and returns its result.
+ */
+static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, true);
+}
+
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Fold hook for sub2_i32: selects the subtraction variant of
+ * fold_addsub2_i32() and returns its result.
+ */
+static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, false);
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_add2_i32:
-        case INDEX_op_sub2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-                uint32_t al = arg_info(op->args[2])->val;
-                uint32_t ah = arg_info(op->args[3])->val;
-                uint32_t bl = arg_info(op->args[4])->val;
-                uint32_t bh = arg_info(op->args[5])->val;
-                uint64_t a = ((uint64_t)ah << 32) | al;
-                uint64_t b = ((uint64_t)bh << 32) | bl;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                if (opc == INDEX_op_add2_i32) {
-                    a += b;
-                } else {
-                    a -= b;
-                }
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
-                continue;
-            }
-            break;
 
         default:
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
+        case INDEX_op_add2_i32:
+            done = fold_add2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
+        case INDEX_op_sub2_i32:
+            done = fold_sub2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+/*
+ * Simplify a movcond.  args[1]/args[2] are the compared values,
+ * args[3]/args[4] the values selected when the condition (args[5])
+ * is true/false.
+ *
+ *   - Condition known: the op becomes a plain move of the selected
+ *     value and the result of tcg_opt_gen_mov() is returned.
+ *   - Both selectable values constant and equal to (1, 0) or (0, 1):
+ *     the op is rewritten in place to a setcond of matching width,
+ *     with the condition inverted for the (0, 1) form; returns false.
+ *   - Otherwise the op is left untouched; returns false.
+ */
+static bool fold_movcond(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[5];
+    int known = do_constant_folding_cond(op->opc, op->args[1],
+                                         op->args[2], cond);
+    uint64_t if_true, if_false;
+
+    if (known >= 0) {
+        /* known == 1 selects args[3], known == 0 selects args[4]. */
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - known]);
+    }
+
+    if (!arg_is_const(op->args[3]) || !arg_is_const(op->args[4])) {
+        return false;
+    }
+
+    if_true = arg_info(op->args[3])->val;
+    if_false = arg_info(op->args[4])->val;
+
+    if (if_true == 0 && if_false == 1) {
+        cond = tcg_invert_cond(cond);
+    } else if (if_true != 1 || if_false != 0) {
+        return false;
+    }
+
+    op->args[3] = cond;
+    op->opc = (op->opc == INDEX_op_movcond_i32
+               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+    return false;
+}
+
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(movcond):
-            i = do_constant_folding_cond(opc, op->args[1],
-                                         op->args[2], op->args[5]);
-            if (i >= 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
-                continue;
-            }
-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-                uint64_t tv = arg_info(op->args[3])->val;
-                uint64_t fv = arg_info(op->args[4])->val;
-                TCGCond cond = op->args[5];
-
-                if (fv == 1 && tv == 0) {
-                    cond = tcg_invert_cond(cond);
-                } else if (!(tv == 1 && fv == 0)) {
-                    break;
-                }
-                op->args[3] = cond;
-                op->opc = opc = (opc == INDEX_op_movcond_i32
-                                 ? INDEX_op_setcond_i32
-                                 : INDEX_op_setcond_i64);
-            }
-            break;
-
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(movcond):
+            done = fold_movcond(&ctx, op);
+            break;
         CASE_OP_32_64(mul):
             done = fold_mul(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Constant-fold extract2.  When both inputs (args[1], args[2]) are
+ * known constants, compute (args[1] >> shr) | (args[2] << (width - shr))
+ * with shr taken from args[3], and replace the op with a constant move
+ * into args[0].  Returns the result of tcg_opt_gen_movi() when folded,
+ * false otherwise.
+ *
+ * NOTE(review): assumes 0 < shr < width; a shift count equal to the
+ * operand width would be undefined -- confirm the op generator never
+ * emits shr == 0.
+ */
+static bool fold_extract2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t v1 = arg_info(op->args[1])->val;
+        uint64_t v2 = arg_info(op->args[2])->val;
+        int shr = op->args[3];
+
+        if (op->opc == INDEX_op_extract2_i64) {
+            v1 >>= shr;
+            v2 <<= 64 - shr;
+        } else {
+            v1 = (uint32_t)v1 >> shr;
+            /*
+             * Shift as unsigned: left-shifting a negative int32_t is
+             * undefined behaviour in ISO C.  The cast back to int32_t
+             * then sign-extends the 32-bit result into the 64-bit
+             * value, as before.
+             */
+            v2 = (int32_t)((uint32_t)v2 << (32 - shr));
+        }
+        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
+    }
+    return false;
+}
+
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
     return fold_const1(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract2):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                uint64_t v1 = arg_info(op->args[1])->val;
-                uint64_t v2 = arg_info(op->args[2])->val;
-                int shr = op->args[3];
-
-                if (opc == INDEX_op_extract2_i64) {
-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
-                } else {
-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
-                                    ((uint32_t)v2 << (32 - shr)));
-                }
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract2):
+            done = fold_extract2(&ctx, op);
+            break;
         CASE_OP_32_64(ext8s):
         CASE_OP_32_64(ext16s):
         case INDEX_op_ext32s_i64:
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Constant-fold extract.  When the input (args[1]) is a known
+ * constant, pull out the unsigned bit-field described by args[2] and
+ * args[3] via extract64() and replace the op with a constant move
+ * into args[0].  Returns the result of tcg_opt_gen_movi() when folded,
+ * false otherwise.
+ */
+static bool fold_extract(OptContext *ctx, TCGOp *op)
+{
+    uint64_t field;
+
+    if (!arg_is_const(op->args[1])) {
+        return false;
+    }
+
+    field = extract64(arg_info(op->args[1])->val, op->args[2], op->args[3]);
+    return tcg_opt_gen_movi(ctx, op, op->args[0], field);
+}
+
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 }
 
+/*
+ * Constant-fold sextract.  When the input (args[1]) is a known
+ * constant, pull out the sign-extended bit-field described by args[2]
+ * and args[3] via sextract64() and replace the op with a constant move
+ * into args[0].  Returns the result of tcg_opt_gen_movi() when folded,
+ * false otherwise.
+ */
+static bool fold_sextract(OptContext *ctx, TCGOp *op)
+{
+    uint64_t field;
+
+    if (!arg_is_const(op->args[1])) {
+        return false;
+    }
+
+    field = sextract64(arg_info(op->args[1])->val, op->args[2], op->args[3]);
+    return tcg_opt_gen_movi(ctx, op, op->args[0], field);
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract):
-            if (arg_is_const(op->args[1])) {
-                tmp = extract64(arg_info(op->args[1])->val,
-                                op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        CASE_OP_32_64(sextract):
-            if (arg_is_const(op->args[1])) {
-                tmp = sextract64(arg_info(op->args[1])->val,
-                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract):
+            done = fold_extract(&ctx, op);
+            break;
         CASE_OP_32_64(extract2):
             done = fold_extract2(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_setcond2_i32:
             done = fold_setcond2(&ctx, op);
             break;
+        CASE_OP_32_64(sextract):
+            done = fold_sextract(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
     return fold_const1(ctx, op);
 }
 
+static bool fold_deposit(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = deposit64(t1, op->args[3], op->args[4], t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
 static bool fold_divide(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(deposit):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = deposit64(arg_info(op->args[1])->val,
-                                op->args[3], op->args[4],
-                                arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
+        CASE_OP_32_64(deposit):
+            done = fold_deposit(&ctx, op);
+            break;
         CASE_OP_32_64(div):
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_bswap(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+
+        t = do_constant_folding(op->opc, t, op->args[2]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(bswap16):
-        CASE_OP_32_64(bswap32):
-        case INDEX_op_bswap64_i64:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          op->args[2]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_brcond2_i32:
             done = fold_brcond2(&ctx, op);
             break;
+        CASE_OP_32_64(bswap16):
+        CASE_OP_32_64(bswap32):
+        case INDEX_op_bswap64_i64:
+            done = fold_bswap(&ctx, op);
+            break;
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             done = fold_count_zeros(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_dup(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+        t = dup_const(TCGOP_VECE(op), t);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_dup2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
+                               arg_info(op->args[2])->val);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+
+    if (args_are_copies(op->args[1], op->args[2])) {
+        op->opc = INDEX_op_dup_vec;
+        TCGOP_VECE(op) = MO_32;
+    }
+    return false;
+}
+
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             break;
 
-        case INDEX_op_dup_vec:
-            if (arg_is_const(op->args[1])) {
-                tmp = arg_info(op->args[1])->val;
-                tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        case INDEX_op_dup2_vec:
-            assert(TCG_TARGET_REG_BITS == 32);
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0],
-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
-                                           arg_info(op->args[2])->val));
-                continue;
-            } else if (args_are_copies(op->args[1], op->args[2])) {
-                op->opc = INDEX_op_dup_vec;
-                TCGOP_VECE(op) = MO_32;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
             break;
+        case INDEX_op_dup_vec:
+            done = fold_dup(&ctx, op);
+            break;
+        case INDEX_op_dup2_vec:
+            done = fold_dup2(&ctx, op);
+            break;
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
-- 
2.25.1

This is the final entry in the main switch that was in a
different form.  After this, we have the option to convert
the switch into a function dispatch table.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mov(OptContext *ctx, TCGOp *op)
+{
+    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+}
+
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Propagate constants through copy operations and do constant
-           folding.  Constants will be substituted to arguments by register
-           allocator where needed and possible.  Also detect copies. */
+        /*
+         * Process each opcode.
+         * Sorted alphabetically by opcode as much as possible.
+         */
         switch (opc) {
-        CASE_OP_32_64_VEC(mov):
-            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            break;
-
-        default:
-            break;
-
-        /* ---------------------------------------------------------- */
-        /* Sorted alphabetically by opcode as much as possible. */
-
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64_VEC(mov):
+            done = fold_mov(&ctx, op);
+            break;
         CASE_OP_32_64(movcond):
             done = fold_movcond(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
+        default:
+            break;
         }
 
         if (!done) {
-- 
2.25.1

Pull the "op r, a, a => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to @i. */
+static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
  */
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
 
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(xor):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, a => mov r, a" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to identity. */
+static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
+ *
+ * The ordering of the transformations should be:
+ *   1) those that produce a constant
+ *   2) those that produce a copy
+ *   3) those that produce information about the result value.
  */
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(and):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, 0 => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to @i. */
+static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             continue;
         }
 
-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            if (arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Compute the type of the operation early.

There are at least 4 places that used a def->flags ladder
to determine the type of the operation being optimized.

There were two places that assumed !TCG_OPF_64BIT means
TCG_TYPE_I32, and so could potentially compute incorrect
results for vector operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 1 file changed, 89 insertions(+), 60 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
 
     /* In flight values from optimization. */
     uint64_t z_mask;
+    TCGType type;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
-    const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
     uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     reset_ts(dst_ts);
     di = ts_info(dst_ts);
     si = ts_info(src_ts);
-    def = &tcg_op_defs[op->opc];
-    if (def->flags & TCG_OPF_VECTOR) {
-        new_op = INDEX_op_mov_vec;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        new_op = INDEX_op_mov_i64;
-    } else {
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
         new_op = INDEX_op_mov_i32;
+        break;
+    case TCG_TYPE_I64:
+        new_op = INDEX_op_mov_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
+        new_op = INDEX_op_mov_vec;
+        break;
+    default:
+        g_assert_not_reached();
     }
     op->opc = new_op;
-    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
     op->args[0] = dst;
     op->args[1] = src;
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    const TCGOpDef *def = &tcg_op_defs[op->opc];
-    TCGType type;
-    TCGTemp *tv;
-
-    if (def->flags & TCG_OPF_VECTOR) {
-        type = TCGOP_VECL(op) + TCG_TYPE_V64;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        type = TCG_TYPE_I64;
-    } else {
-        type = TCG_TYPE_I32;
-    }
-
     /* Convert movi to mov with constant temp. */
-    tv = tcg_constant_internal(type, val);
+    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     }
 }
 
-static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
+static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
+                                    uint64_t x, uint64_t y)
 {
-    const TCGOpDef *def = &tcg_op_defs[op];
     uint64_t res = do_constant_folding_2(op, x, y);
-    if (!(def->flags & TCG_OPF_64BIT)) {
+    if (type == TCG_TYPE_I32) {
         res = (int32_t)res;
     }
     return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
  * Return -1 if the condition can't be simplified,
  * and the result of the condition (0 or 1) if it can.
  */
-static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+static int do_constant_folding_cond(TCGType type, TCGArg x,
                                     TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
 
     if (arg_is_const(x) && arg_is_const(y)) {
-        const TCGOpDef *def = &tcg_op_defs[op];
-        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
-        if (def->flags & TCG_OPF_64BIT) {
-            return do_constant_folding_cond_64(xv, yv, c);
-        } else {
+        switch (type) {
+        case TCG_TYPE_I32:
             return do_constant_folding_cond_32(xv, yv, c);
+        case TCG_TYPE_I64:
+            return do_constant_folding_cond_64(xv, yv, c);
+        default:
+            /* Only scalar comparisons are optimizable */
+            return -1;
         }
     } else if (args_are_copies(x, y)) {
         return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = do_constant_folding(op->opc, t, 0);
+        t = do_constant_folding(op->opc, ctx->type, t, 0);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
         uint64_t t1 = arg_info(op->args[1])->val;
         uint64_t t2 = arg_info(op->args[2])->val;
 
-        t1 = do_constant_folding(op->opc, t1, t2);
+        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                      op->args[2], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
             goto do_brcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
-        t = do_constant_folding(op->opc, t, op->args[2]);
+        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         uint64_t t = arg_info(op->args[1])->val;
 
         if (t != 0) {
-            t = do_constant_folding(op->opc, t, 0);
+            t = do_constant_folding(op->opc, ctx->type, t, 0);
             return tcg_opt_gen_movi(ctx, op, op->args[0], t);
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
-    TCGOpcode opc = op->opc;
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
+        TCGOpcode opc;
 
-        opc = (opc == INDEX_op_movcond_i32
-               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+        switch (ctx->type) {
+        case TCG_TYPE_I32:
+            opc = INDEX_op_setcond_i32;
+            break;
+        case TCG_TYPE_I64:
+            opc = INDEX_op_setcond_i64;
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         if (tv == 1 && fv == 0) {
             op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
             goto do_setcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                      op->args[4], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
         copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
+        /* Pre-compute the type of the operation. */
+        if (def->flags & TCG_OPF_VECTOR) {
+            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
+        } else if (def->flags & TCG_OPF_64BIT) {
+            ctx.type = TCG_TYPE_I64;
+        } else {
+            ctx.type = TCG_TYPE_I32;
+        }
+
         /* For commutative operations make constant second argument */
         switch (opc) {
         CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Proceed with possible constant folding. */
                     break;
                 }
-                if (opc == INDEX_op_sub_i32) {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     neg_op = INDEX_op_neg_i32;
                     have_neg = TCG_TARGET_HAS_neg_i32;
-                } else if (opc == INDEX_op_sub_i64) {
+                    break;
+                case TCG_TYPE_I64:
                     neg_op = INDEX_op_neg_i64;
                     have_neg = TCG_TARGET_HAS_neg_i64;
-                } else if (TCG_TARGET_HAS_neg_vec) {
-                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
-                    unsigned vece = TCGOP_VECE(op);
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
-                } else {
                     break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    neg_op = INDEX_op_neg_vec;
+                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
+                                                   TCGOP_VECE(op)) > 0;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_neg) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGOpcode not_op;
                 bool have_not;
 
-                if (def->flags & TCG_OPF_VECTOR) {
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                } else if (def->flags & TCG_OPF_64BIT) {
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                } else {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     not_op = INDEX_op_not_i32;
                     have_not = TCG_TARGET_HAS_not_i32;
+                    break;
+                case TCG_TYPE_I64:
+                    not_op = INDEX_op_not_i64;
+                    have_not = TCG_TARGET_HAS_not_i64;
+                    break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    not_op = INDEX_op_not_vec;
+                    have_not = TCG_TARGET_HAS_not_vec;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_not) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
         partmask = z_mask;
-        if (!(def->flags & TCG_OPF_64BIT)) {
+        if (ctx.type == TCG_TYPE_I32) {
             z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
-- 
2.25.1

Split out the conditional conversion from a more complex logical
operation to a simple NOT.  Create a couple more helpers to make
this easy for the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/*
+ * Convert @op to NOT, if NOT is supported by the host.
+ * Return true if the conversion is successful, which will still
+ * indicate that the processing is complete.
+ */
+static bool fold_not(OptContext *ctx, TCGOp *op);
+static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
+{
+    TCGOpcode not_op;
+    bool have_not;
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        not_op = INDEX_op_not_i32;
+        have_not = TCG_TARGET_HAS_not_i32;
+        break;
+    case TCG_TYPE_I64:
+        not_op = INDEX_op_not_i64;
+        have_not = TCG_TARGET_HAS_not_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        not_op = INDEX_op_not_vec;
+        have_not = TCG_TARGET_HAS_not_vec;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_not) {
+        op->opc = not_op;
+        op->args[1] = op->args[idx];
+        return fold_not(ctx, op);
+    }
+    return false;
+}
+
+/* If the binary operation has first argument @i, fold to NOT. */
+static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return fold_to_not(ctx, op, 2);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to @i. */
 static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to NOT. */
+static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return fold_to_not(ctx, op, 1);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, -1)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_not(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    /* Because of fold_to_not, we want to always return true, via finish. */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_ix_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             }
             break;
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(nand):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64(nor):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(andc):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == -1) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        try_not:
-            {
-                TCGOpcode not_op;
-                bool have_not;
-
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    not_op = INDEX_op_not_i32;
-                    have_not = TCG_TARGET_HAS_not_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_not) {
-                    break;
-                }
-                op->opc = not_op;
-                reset_temp(op->args[0]);
-                op->args[1] = op->args[i];
-                continue;
-            }
         default:
             break;
         }
-- 
2.25.1

Even though there is only one user, place this more complex
conversion into its own helper.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+    /*
+     * Because of fold_sub_to_neg, we want to always return true,
+     * via finish_folding.
+     */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode neg_op;
+    bool have_neg;
+
+    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
+        return false;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        neg_op = INDEX_op_neg_i32;
+        have_neg = TCG_TARGET_HAS_neg_i32;
+        break;
+    case TCG_TYPE_I64:
+        neg_op = INDEX_op_neg_i64;
+        have_neg = TCG_TARGET_HAS_neg_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        neg_op = INDEX_op_neg_vec;
+        have_neg = (TCG_TARGET_HAS_neg_vec &&
+                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_neg) {
+        op->opc = neg_op;
+        op->args[1] = op->args[2];
+        return fold_neg(ctx, op);
+    }
+    return false;
+}
+
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_sub_to_neg(ctx, op)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 continue;
             }
             break;
-        CASE_OP_32_64_VEC(sub):
-            {
-                TCGOpcode neg_op;
-                bool have_neg;
-
-                if (arg_is_const(op->args[2])) {
-                    /* Proceed with possible constant folding. */
-                    break;
-                }
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    neg_op = INDEX_op_neg_i32;
-                    have_neg = TCG_TARGET_HAS_neg_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    neg_op = INDEX_op_neg_i64;
-                    have_neg = TCG_TARGET_HAS_neg_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
-                                                   TCGOP_VECE(op)) > 0;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_neg) {
-                    break;
-                }
-                if (arg_is_const(op->args[1])
-                    && arg_info(op->args[1])->val == 0) {
-                    op->opc = neg_op;
-                    reset_temp(op->args[0]);
-                    op->args[1] = op->args[2];
-                    continue;
-                }
-            }
-            break;
         default:
             break;
         }
-- 
2.25.1

Pull the "op r, a, i => mov r, a" optimization into a function,
and use it in the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to identity. */
+static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to NOT. */
 static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_sub_to_neg(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, const => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Pull the "op r, 0, b => movi r, 0" optimization into a function,
and use it in fold_shift.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
     return false;
 }
 
+/* If the binary operation has first argument @i, fold to @i. */
+static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has first argument @i, fold to NOT. */
 static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
-           and "sub r, 0, a => neg r, a" case.  */
-        switch (opc) {
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Move all of the known-zero optimizations into the per-opcode
functions.  Use fold_masks when there is a possibility of the
result being determined, and simply set ctx->z_mask otherwise.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
 1 file changed, 294 insertions(+), 251 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGTempSet temps_used;
 
     /* In flight values from optimization. */
-    uint64_t z_mask;
+    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
+    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
     TCGType type;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_masks(OptContext *ctx, TCGOp *op)
+{
+    uint64_t a_mask = ctx->a_mask;
+    uint64_t z_mask = ctx->z_mask;
+
+    /*
+     * 32-bit ops generate 32-bit results.  For the result-is-zero test
+     * below, we can ignore high bits, but for further optimizations we
+     * need to record that the high bits contain garbage.
+     */
+    if (ctx->type == TCG_TYPE_I32) {
+        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
+        a_mask &= MAKE_64BIT_MASK(0, 32);
+        z_mask &= MAKE_64BIT_MASK(0, 32);
+    }
+
+    if (z_mask == 0) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
+    }
+    if (a_mask == 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * Convert @op to NOT, if NOT is supported by the host.
  * Return true if the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1, z2;
+
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+    z2 = arg_info(op->args[2])->z_mask;
+    ctx->z_mask = z1 & z2;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer affected bits from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        ctx->a_mask = z1 & ~z2;
+    }
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1;
+
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer anything from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
+        ctx->a_mask = z1 & ~z2;
+        z1 &= z2;
+    }
+    ctx->z_mask = z1;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, sign;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
         t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask = arg_info(op->args[1])->z_mask;
+    switch (op->opc) {
+    case INDEX_op_bswap16_i32:
+    case INDEX_op_bswap16_i64:
+        z_mask = bswap16(z_mask);
+        sign = INT16_MIN;
+        break;
+    case INDEX_op_bswap32_i32:
+    case INDEX_op_bswap32_i64:
+        z_mask = bswap32(z_mask);
+        sign = INT32_MIN;
+        break;
+    case INDEX_op_bswap64_i64:
+        z_mask = bswap64(z_mask);
+        sign = INT64_MIN;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+    case TCG_BSWAP_OZ:
+        break;
+    case TCG_BSWAP_OS:
+        /* If the sign bit may be 1, force all the bits above to 1. */
+        if (z_mask & sign) {
+            z_mask |= sign;
+        }
+        break;
+    default:
+        /* The high bits are undefined: force all bits above the sign to 1. */
+        z_mask |= sign << 1;
+        break;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
 
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
     }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        z_mask = 31;
+        break;
+    case TCG_TYPE_I64:
+        z_mask = 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
+
     return false;
 }
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        ctx->z_mask = 32 | 31;
+        break;
+    case TCG_TYPE_I64:
+        ctx->z_mask = 64 | 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
 }
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
         t1 = deposit64(t1, op->args[3], op->args[4], t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
+
+    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                            op->args[3], op->args[4],
+                            arg_info(op->args[2])->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
         t = extract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask, sign;
+    bool type_change = false;
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8s):
+        sign = INT8_MIN;
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16s):
+        sign = INT16_MIN;
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_ext_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32s_i64:
+        sign = INT32_MIN;
+        z_mask = (uint32_t)z_mask;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if (z_mask & sign) {
+        z_mask |= sign;
+    } else if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extu(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask;
+    bool type_change = false;
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8u):
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16u):
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_extrl_i64_i32:
+    case INDEX_op_extu_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32u_i64:
+        z_mask = (uint32_t)z_mask;
+        break;
+    case INDEX_op_extrh_i64_i32:
+        type_change = true;
+        z_mask >>= 32;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    ctx->z_mask = z_mask;
+    if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    return fold_masks(ctx, op);
 }
 
 static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
 
+    ctx->z_mask = arg_info(op->args[3])->z_mask
+                | arg_info(op->args[4])->z_mask;
+
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (fold_const1(ctx, op)) {
         return true;
     }
+
+    /* Set to 1 the rightmost possibly-set bit and all bits to its left.  */
+    z_mask = arg_info(op->args[1])->z_mask;
+    ctx->z_mask = -(z_mask & -z_mask);
+
     /*
      * Because of fold_sub_to_neg, we want to always return true,
      * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
+    MemOp mop = get_memop(oi);
+    int width = 8 * memop_size(mop);
+
+    if (!(mop & MO_SIGN) && width < 64) {
+        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    }
+
     /* Opcodes that touch guest memory stop the mb optimization.  */
     ctx->prev_mb = NULL;
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
+
+    ctx->z_mask = 1;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
         op->opc = INDEX_op_setcond_i32;
         break;
     }
+
+    ctx->z_mask = 1;
     return false;
 
  do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
+    int64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
         t = sextract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0 && z_mask >= 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
+
+    if (arg_is_const(op->args[2])) {
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
+                                          arg_info(op->args[1])->z_mask,
+                                          arg_info(op->args[2])->val);
+        return fold_masks(ctx, op);
+    }
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
     return fold_addsub2_i32(ctx, op, false);
 }
 
+static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
+{
+    /* We can't do any folding with a load, but we can record bits. */
+    switch (op->opc) {
+    CASE_OP_32_64(ld8u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        break;
+    CASE_OP_32_64(ld16u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        break;
+    case INDEX_op_ld32u_i64:
+        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
         bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify using known-zero bits. Currently only ops with a single
-           output argument is supported. */
-        z_mask = -1;
-        affected = -1;
-        switch (opc) {
-        CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext8u):
-            z_mask = 0xff;
-            goto and_const;
-        CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext16u):
-            z_mask = 0xffff;
-            goto and_const;
-        case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_ext32u_i64:
-            z_mask = 0xffffffffU;
-            goto and_const;
-
-        CASE_OP_32_64(and):
-            z_mask = arg_info(op->args[2])->z_mask;
-            if (arg_is_const(op->args[2])) {
-        and_const:
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            z_mask = arg_info(op->args[1])->z_mask & z_mask;
-            break;
-
-        case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_extu_i32_i64:
-            /* We do not compute affected as it is a size changing op.  */
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-
-        CASE_OP_32_64(andc):
-            /* Known-zeros does not imply known-ones.  Therefore unless
-               op->args[2] is constant, we can't infer anything from it.  */
-            if (arg_is_const(op->args[2])) {
-                z_mask = ~arg_info(op->args[2])->z_mask;
-                goto and_const;
-            }
-            /* But we certainly know nothing outside args[1] may be set. */
-            z_mask = arg_info(op->args[1])->z_mask;
-            break;
-
-        case INDEX_op_sar_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_sar_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_shr_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_shr_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_extrl_i64_i32:
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-        case INDEX_op_extrh_i64_i32:
-            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
-            break;
-
-        CASE_OP_32_64(shl):
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                z_mask = arg_info(op->args[1])->z_mask << tmp;
-            }
-            break;
-
-        CASE_OP_32_64(neg):
-            /* Set to 1 all bits to the left of the rightmost.  */
-            z_mask = -(arg_info(op->args[1])->z_mask
-                       & -arg_info(op->args[1])->z_mask);
-            break;
-
-        CASE_OP_32_64(deposit):
-            z_mask = deposit64(arg_info(op->args[1])->z_mask,
-                               op->args[3], op->args[4],
-                               arg_info(op->args[2])->z_mask);
-            break;
-
-        CASE_OP_32_64(extract):
-            z_mask = extract64(arg_info(op->args[1])->z_mask,
-                               op->args[2], op->args[3]);
-            if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-        CASE_OP_32_64(sextract):
-            z_mask = sextract64(arg_info(op->args[1])->z_mask,
-                                op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(xor):
-            z_mask = arg_info(op->args[1])->z_mask
-                   | arg_info(op->args[2])->z_mask;
-            break;
-
-        case INDEX_op_clz_i32:
-        case INDEX_op_ctz_i32:
-            z_mask = arg_info(op->args[2])->z_mask | 31;
-            break;
-
-        case INDEX_op_clz_i64:
-        case INDEX_op_ctz_i64:
-            z_mask = arg_info(op->args[2])->z_mask | 63;
-            break;
-
-        case INDEX_op_ctpop_i32:
-            z_mask = 32 | 31;
-            break;
-        case INDEX_op_ctpop_i64:
-            z_mask = 64 | 63;
-            break;
-
-        CASE_OP_32_64(setcond):
-        case INDEX_op_setcond2_i32:
-            z_mask = 1;
-            break;
-
-        CASE_OP_32_64(movcond):
-            z_mask = arg_info(op->args[3])->z_mask
-                   | arg_info(op->args[4])->z_mask;
-            break;
-
-        CASE_OP_32_64(ld8u):
-            z_mask = 0xff;
-            break;
-        CASE_OP_32_64(ld16u):
-            z_mask = 0xffff;
-            break;
-        case INDEX_op_ld32u_i64:
-            z_mask = 0xffffffffu;
-            break;
-
-        CASE_OP_32_64(qemu_ld):
-            {
-                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
-                MemOp mop = get_memop(oi);
-                if (!(mop & MO_SIGN)) {
-                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
-                }
-            }
-            break;
-
-        CASE_OP_32_64(bswap16):
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffff) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap16(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int16_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(16, 48);
-                break;
-            }
-            break;
-
-        case INDEX_op_bswap32_i64:
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffffffffu) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap32(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int32_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(32, 32);
-                break;
-            }
-            break;
-
-        default:
-            break;
-        }
-
-        /* 32-bit ops generate 32-bit results.  For the result is zero test
-           below, we can ignore high bits, but for further optimizations we
-           need to record that the high bits contain garbage.  */
-        partmask = z_mask;
-        if (ctx.type == TCG_TYPE_I32) {
-            z_mask |= ~(tcg_target_ulong)0xffffffffu;
-            partmask &= 0xffffffffu;
-            affected &= 0xffffffffu;
-        }
-        ctx.z_mask = z_mask;
-
-        if (partmask == 0) {
-            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-            continue;
-        }
-        if (affected == 0) {
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
-        }
+        /* Assume all bits affected, and no bits known zero. */
+        ctx.a_mask = -1;
+        ctx.z_mask = -1;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32u_i64:
+            done = fold_tcg_ld(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
-- 
2.25.1

Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
and muls2_i64.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-        uint32_t a = arg_info(op->args[2])->val;
-        uint32_t b = arg_info(op->args[3])->val;
-        uint64_t r = (uint64_t)a * b;
+        uint64_t a = arg_info(op->args[2])->val;
+        uint64_t b = arg_info(op->args[3])->val;
+        uint64_t h, l;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
+
+        switch (op->opc) {
+        case INDEX_op_mulu2_i32:
+            l = (uint64_t)(uint32_t)a * (uint32_t)b;
+            h = (int32_t)(l >> 32);
+            l = (int32_t)l;
+            break;
+        case INDEX_op_muls2_i32:
+            l = (int64_t)(int32_t)a * (int32_t)b;
+            h = l >> 32;
+            l = (int32_t)l;
+            break;
+        case INDEX_op_mulu2_i64:
+            mulu64(&l, &h, a, b);
+            break;
+        case INDEX_op_muls2_i64:
+            muls64(&l, &h, a, b);
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, l);
+        tcg_opt_gen_movi(ctx, op2, rh, h);
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
-        case INDEX_op_mulu2_i32:
-            done = fold_mulu2_i32(&ctx, op);
+        CASE_OP_32_64(muls2):
+        CASE_OP_32_64(mulu2):
+            done = fold_multiply2(&ctx, op);
             break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
-- 
2.25.1

Rename to fold_addsub2.
Use Int128 to implement the wider operation.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/int128.h"
 #include "tcg/tcg-op.h"
 #include "tcg-internal.h"
 
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-        uint32_t al = arg_info(op->args[2])->val;
-        uint32_t ah = arg_info(op->args[3])->val;
-        uint32_t bl = arg_info(op->args[4])->val;
-        uint32_t bh = arg_info(op->args[5])->val;
-        uint64_t a = ((uint64_t)ah << 32) | al;
-        uint64_t b = ((uint64_t)bh << 32) | bl;
+        uint64_t al = arg_info(op->args[2])->val;
+        uint64_t ah = arg_info(op->args[3])->val;
+        uint64_t bl = arg_info(op->args[4])->val;
+        uint64_t bh = arg_info(op->args[5])->val;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
 
-        if (add) {
-            a += b;
+        if (ctx->type == TCG_TYPE_I32) {
+            uint64_t a = deposit64(al, 32, 32, ah);
+            uint64_t b = deposit64(bl, 32, 32, bh);
+
+            if (add) {
+                a += b;
+            } else {
+                a -= b;
+            }
+
+            al = sextract64(a, 0, 32);
+            ah = sextract64(a, 32, 32);
         } else {
-            a -= b;
+            Int128 a = int128_make128(al, ah);
+            Int128 b = int128_make128(bl, bh);
+
+            if (add) {
+                a = int128_add(a, b);
+            } else {
+                a = int128_sub(a, b);
+            }
+
+            al = int128_getlo(a);
+            ah = int128_gethi(a);
         }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, al);
+        tcg_opt_gen_movi(ctx, op2, rh, ah);
         return true;
     }
     return false;
 }
 
-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, true);
+    return fold_addsub2(ctx, op, true);
 }
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_sub2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, false);
+    return fold_addsub2(ctx, op, false);
 }
 
 static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
-        case INDEX_op_add2_i32:
-            done = fold_add2_i32(&ctx, op);
+        CASE_OP_32_64(add2):
+            done = fold_add2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-        case INDEX_op_sub2_i32:
-            done = fold_sub2_i32(&ctx, op);
+        CASE_OP_32_64(sub2):
+            done = fold_sub2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
-- 
2.25.1

Most of these are handled by creating a fold_const2_commutative
to handle all of the binary operators.  The rest were already
handled on a case-by-case basis in the switch, and have their
own fold function in which to place the call.

We now have only one major switch on TCGOpcode.

Introduce NO_DEST and a block comment for swap_commutative in
order to make the handling of brcond and movcond opcodes cleaner.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
 1 file changed, 70 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     return -1;
 }
 
+/**
+ * swap_commutative:
+ * @dest: TCGArg of the destination argument, or NO_DEST.
+ * @p1: first paired argument
+ * @p2: second paired argument
+ *
+ * If *@p1 is a constant and *@p2 is not, swap.
+ * If *@p2 matches @dest, swap.
+ * Return true if a swap was performed.
+ */
+
+#define NO_DEST  temp_arg(NULL)
+
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
 {
     TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return fold_const2(ctx, op);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 
 static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
+    /* Note that the high and low parts may be independently swapped. */
+    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
+    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
+
     return fold_addsub2(ctx, op, true);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     uint64_t z1, z2;
 
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
+        op->args[2] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
 static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[4];
-    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     TCGArg label = op->args[5];
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[0], &op->args[2])) {
+        op->args[4] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     if (i >= 0) {
         goto do_brcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+    /*
+     * Canonicalize the "false" input reg to match the destination reg so
+     * that the tcg backend can implement a "move if true" operation.
+     */
+    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
+        op->args[5] = cond = tcg_invert_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 
 static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
+    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
+
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
         uint64_t a = arg_info(op->args[2])->val;
         uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
+        op->args[3] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[1], &op->args[3])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
     if (i >= 0) {
         goto do_setcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* For commutative operations make constant second argument */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
-            break;
-        CASE_OP_32_64(brcond):
-            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
-                op->args[2] = tcg_swap_cond(op->args[2]);
-            }
-            break;
-        CASE_OP_32_64(setcond):
-            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
-                op->args[3] = tcg_swap_cond(op->args[3]);
-            }
-            break;
-        CASE_OP_32_64(movcond):
-            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            /* For movcond, we canonicalize the "false" input reg to match
-               the destination reg so that the tcg backend can implement
-               a "move if true" operation.  */
-            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-                op->args[5] = tcg_invert_cond(op->args[5]);
-            }
-            break;
-        CASE_OP_32_64(add2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
-            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
-            break;
-        CASE_OP_32_64(mulu2):
-        CASE_OP_32_64(muls2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
-            break;
-        case INDEX_op_brcond2_i32:
-            if (swap_commutative2(&op->args[0], &op->args[2])) {
-                op->args[4] = tcg_swap_cond(op->args[4]);
-            }
-            break;
-        case INDEX_op_setcond2_i32:
-            if (swap_commutative2(&op->args[1], &op->args[3])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Assume all bits affected, and no bits known zero. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
-- 
2.25.1

This "garbage" setting pre-dates the addition of the type
changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
and INDEX_op_extr{l,h}_i64_i32.

So now we have definitive points at which to adjust z_mask
to eliminate such bits from the 32-bit operands.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-            /* High bits of a 32-bit quantity are garbage.  */
-            ti->z_mask |= ~0xffffffffull;
-        }
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     TCGTemp *src_ts = arg_temp(src);
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    z_mask = si->z_mask;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-        /* High bits of the destination are now garbage.  */
-        z_mask |= ~0xffffffffull;
-    }
-    di->z_mask = z_mask;
+    di->z_mask = si->z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    /* Convert movi to mov with constant temp. */
-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+    TCGTemp *tv;
 
+    if (ctx->type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    /* Convert movi to mov with constant temp. */
+    tv = tcg_constant_internal(ctx->type, val);
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     uint64_t z_mask = ctx->z_mask;
 
     /*
-     * 32-bit ops generate 32-bit results.  For the result is zero test
-     * below, we can ignore high bits, but for further optimizations we
-     * need to record that the high bits contain garbage.
+     * 32-bit ops generate 32-bit results, which for the purpose of
+     * simplifying tcg are sign-extended.  Certainly that's how we
+     * represent our constants elsewhere.  Note that the bits will
+     * be reset properly for a 64-bit value when encountering the
+     * type changing opcodes.
      */
     if (ctx->type == TCG_TYPE_I32) {
-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
-        a_mask &= MAKE_64BIT_MASK(0, 32);
-        z_mask &= MAKE_64BIT_MASK(0, 32);
+        a_mask = (int32_t)a_mask;
+        z_mask = (int32_t)z_mask;
+        ctx->z_mask = z_mask;
     }
 
     if (z_mask == 0) {
-- 
2.25.1

Certain targets, like riscv, produce signed 32-bit results.
This can lead to lots of redundant extensions as values are
manipulated.

Begin by tracking only the obvious sign-extensions, and
converting them to simple copies when possible.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 102 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *next_copy;
     uint64_t val;
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
 } TempOptInfo;
 
 typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     /* In flight values from optimization. */
     uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
+    uint64_t s_mask;  /* mask of clrsb(value) bits */
     TCGType type;
 } OptContext;
 
+/* Calculate the smask for a specific value. */
+static uint64_t smask_from_value(uint64_t value)
+{
+    int rep = clrsb64(value);
+    return ~(~0ull >> rep);
+}
+
+/*
+ * Calculate the smask for a given set of known-zeros.
+ * If there are lots of zeros on the left, we can consider the remainder
+ * an unsigned field, and thus the corresponding signed field is one bit
+ * larger.
+ */
+static uint64_t smask_from_zmask(uint64_t zmask)
+{
+    /*
+     * Only the 0 bits are significant for zmask, thus the msb itself
+     * must be zero, else we have no sign information.
+     */
+    int rep = clz64(zmask);
+    if (rep == 0) {
+        return 0;
+    }
+    rep -= 1;
+    return ~(~0ull >> rep);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->prev_copy = ts;
     ti->is_const = false;
     ti->z_mask = -1;
+    ti->s_mask = 0;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
+        ti->s_mask = smask_from_value(ts->val);
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
+        ti->s_mask = 0;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[1] = src;
 
     di->z_mask = si->z_mask;
+    di->s_mask = si->s_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
 
     nb_oargs = def->nb_oargs;
     for (i = 0; i < nb_oargs; i++) {
-        reset_temp(op->args[i]);
+        TCGTemp *ts = arg_temp(op->args[i]);
+        reset_ts(ts);
         /*
-         * Save the corresponding known-zero bits mask for the
+         * Save the corresponding known-zero/sign bits mask for the
          * first output argument (only one supported so far).
          */
         if (i == 0) {
-            arg_info(op->args[i])->z_mask = ctx->z_mask;
+            ts_info(ts)->z_mask = ctx->z_mask;
+            ts_info(ts)->s_mask = ctx->s_mask;
         }
     }
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
     uint64_t z_mask = ctx->z_mask;
+    uint64_t s_mask = ctx->s_mask;
 
     /*
      * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (ctx->type == TCG_TYPE_I32) {
         a_mask = (int32_t)a_mask;
         z_mask = (int32_t)z_mask;
+        s_mask |= MAKE_64BIT_MASK(32, 32);
         ctx->z_mask = z_mask;
+        ctx->s_mask = s_mask;
     }
 
     if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask, sign;
+    uint64_t z_mask, s_mask, sign;
 
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     }
 
     z_mask = arg_info(op->args[1])->z_mask;
+
     switch (op->opc) {
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
+    s_mask = smask_from_zmask(z_mask);
 
     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
     case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
         /* If the sign bit may be 1, force all the bits above to 1. */
         if (z_mask & sign) {
             z_mask |= sign;
+            s_mask = sign << 1;
         }
         break;
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
+        s_mask = 0;
         break;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask_old, z_mask;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = extract64(t, op->args[2], op->args[3]);
+        t = extract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
     z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0) {
+    z_mask = extract64(z_mask_old, pos, len);
+    if (pos == 0) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask_old, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask, sign;
     bool type_change = false;
 
     if (fold_const1(ctx, op)) {
         return true;
     }
 
-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+    s_mask = arg_info(op->args[1])->s_mask;
+    s_mask_old = s_mask;
 
     switch (op->opc) {
     CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     if (z_mask & sign) {
         z_mask |= sign;
-    } else if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
     }
+    s_mask |= sign << 1;
+
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
+    if (!type_change) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
     if (!type_change) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
 
-    if (!(mop & MO_SIGN) && width < 64) {
-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    if (width < 64) {
+        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        if (!(mop & MO_SIGN)) {
+            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            ctx->s_mask <<= 1;
+        }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
-    int64_t z_mask_old, z_mask;
+    uint64_t z_mask, s_mask, s_mask_old;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = sextract64(t, op->args[2], op->args[3]);
+        t = sextract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
-    z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0 && z_mask >= 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
-    }
+    z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask, pos, len);
     ctx->z_mask = z_mask;
 
+    s_mask_old = arg_info(op->args[1])->s_mask;
+    s_mask = sextract64(s_mask_old, pos, len);
+    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
+    ctx->s_mask = s_mask;
+
+    if (pos == 0) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
+
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 {
     /* We can't do any folding with a load, but we can record bits. */
     switch (op->opc) {
+    CASE_OP_32_64(ld8s):
+        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        break;
     CASE_OP_32_64(ld8u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
+        break;
+    CASE_OP_32_64(ld16s):
+        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
         break;
     CASE_OP_32_64(ld16u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
+        break;
+    case INDEX_op_ld32s_i64:
+        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
         break;
     case INDEX_op_ld32u_i64:
         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
         break;
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* Assume all bits affected, and no bits known zero. */
+        /* Assume all bits affected, no bits known zero, no sign reps. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
+        ctx.s_mask = 0;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8s):
         CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16s):
         CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32s_i64:
         case INDEX_op_ld32u_i64:
             done = fold_tcg_ld(&ctx, op);
             break;
-- 
2.25.1

Sign repetitions are perforce all identical, whether they are 1 or 0.
Bitwise operations preserve the relative quantity of the repetitions.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
     z2 = arg_info(op->args[2])->z_mask;
     ctx->z_mask = z1 & z2;
 
+    /*
+     * Sign repetitions are perforce all identical, whether they are 1 or 0.
+     * Bitwise operations preserve the relative quantity of the repetitions.
+     */
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
+
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     }
     ctx->z_mask = z1;
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[3])->z_mask
                 | arg_info(op->args[4])->z_mask;
+    ctx->s_mask = arg_info(op->args[3])->s_mask
+                & arg_info(op->args[4])->s_mask;
 
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
         return true;
     }
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask;
+
     /* Because of fold_to_not, we want to always return true, via finish. */
     finish_folding(ctx, op);
     return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
-- 
2.25.1

For constant shifts, we can simply shift the s_mask.

For variable shifts, we know that sar does not reduce
the s_mask, which helps for sequences like

    ext32s_i64  t, in
    sar_i64     t, t, v
    ext32s_i64  out, t

allowing the final extend to be eliminated.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
     return ~(~0ull >> rep);
 }
 
+/*
+ * Recreate a properly left-aligned smask after manipulation.
+ * Some bit-shuffling, particularly shifts and rotates, may
+ * retain sign bits on the left, but may scatter disconnected
+ * sign bits on the right.  Retain only what remains to the left.
+ */
+static uint64_t smask_from_smask(int64_t smask)
+{
+    /* Only the 1 bits are significant for smask */
+    return smask_from_zmask(~smask);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask, z_mask, sign;
+
     if (fold_const2(ctx, op) ||
         fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
 
+    s_mask = arg_info(op->args[1])->s_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+
     if (arg_is_const(op->args[2])) {
-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-                                          arg_info(op->args[1])->z_mask,
-                                          arg_info(op->args[2])->val);
+        int sh = arg_info(op->args[2])->val;
+
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
+
+        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
+        ctx->s_mask = smask_from_smask(s_mask);
+
         return fold_masks(ctx, op);
     }
+
+    switch (op->opc) {
+    CASE_OP_32_64(sar):
+        /*
+         * Arithmetic right shift will not reduce the number of
+         * input sign repetitions.
+         */
+        ctx->s_mask = s_mask;
+        break;
+    CASE_OP_32_64(shr):
+        /*
+         * If the sign bit is known zero, then logical right shift
+         * will not reduce the number of input sign repetitions.
+         */
+        sign = (s_mask & -s_mask) >> 1;
+        if (!(z_mask & sign)) {
+            ctx->s_mask = s_mask;
+        }
+        break;
+    default:
+        break;
+    }
+
     return false;
 }
 
-- 
2.25.1