Series comparison

-[PATCH 00/27] tcg patch queue
+[PULL 00/56] tcg patch queue
-Pulling together some cleanups, fixes, and preparatory tci stuff.
+The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:
 Most of this has been reviewed, but not all.
-Those lacking review:
+  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)
--tcg-aarch64-Fix-constant-subtraction-in-tcg_out_adds.patch
+are available in the Git repository at:
 -tcg-aarch64-Fix-I3617_CMLE0.patch
 -tcg-aarch64-Fix-generation-of-scalar-vector-operatio.patch
 -tcg-tci-Use-exec-cpu_ldst.h-interfaces.patch
 -tcg-Manage-splitwx-in-tc_ptr_to_region_tree-by-hand.patch
 -accel-tcg-rename-tb_lookup__cpu_state-and-hoist-stat.patch
 -accel-tcg-move-CF_CLUSTER-calculation-to-curr_cflags.patch
 -accel-tcg-drop-the-use-of-CF_HASH_MASK-and-rename-pa.patch
 -include-exec-lightly-re-arrange-TranslationBlock.patch
 -accel-tcg-Precompute-curr_cflags-into-cpu-tcg_cflags.patch
-Alex, the last patch is a re-write and extension of one that
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027
 you did review.
+for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:
-r~
+  tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)
+----------------------------------------------------------------
+Improvements to qemu/int128
+Fixes for 128/64 division.
+Cleanup tcg/optimize.c
+Optimize redundant sign extensions
-Alex Bennée (4):
+----------------------------------------------------------------
-  accel/tcg: rename tb_lookup__cpu_state and hoist state extraction
+Frédéric Pétrot (1):
-  accel/tcg: move CF_CLUSTER calculation to curr_cflags
+      qemu/int128: Add int128_{not,xor}
   accel/tcg: drop the use of CF_HASH_MASK and rename params
   include/exec: lightly re-arrange TranslationBlock
-Richard Henderson (23):
+Luis Pires (4):
-  tcg/aarch64: Fix constant subtraction in tcg_out_addsub2
+      host-utils: move checks out of divu128/divs128
-  tcg/aarch64: Fix I3617_CMLE0
+      host-utils: move udiv_qrnnd() to host-utils
-  tcg/aarch64: Fix generation of "scalar" vector operations
+      host-utils: add 128-bit quotient support to divu128/divs128
-  tcg/tci: Use exec/cpu_ldst.h interfaces
+      host-utils: add unit tests for divu128/divs128
   tcg: Split out tcg_raise_tb_overflow
   tcg: Manage splitwx in tc_ptr_to_region_tree by hand
   tcg/tci: Merge identical cases in generation (arithmetic opcodes)
   tcg/tci: Merge identical cases in generation (exchange opcodes)
   tcg/tci: Merge identical cases in generation (deposit opcode)
   tcg/tci: Merge identical cases in generation (conditional opcodes)
   tcg/tci: Merge identical cases in generation (load/store opcodes)
   tcg/tci: Remove tci_read_r8
   tcg/tci: Remove tci_read_r8s
   tcg/tci: Remove tci_read_r16
   tcg/tci: Remove tci_read_r16s
   tcg/tci: Remove tci_read_r32
   tcg/tci: Remove tci_read_r32s
   tcg/tci: Reduce use of tci_read_r64
   tcg/tci: Merge basic arithmetic operations
   tcg/tci: Merge extension operations
   tcg/tci: Merge bswap operations
   tcg/tci: Merge mov, not and neg operations
   accel/tcg: Precompute curr_cflags into cpu->tcg_cflags
- accel/tcg/tcg-accel-ops.h       |   1 +
+Richard Henderson (51):
- include/exec/exec-all.h         |  19 +-
+      tcg/optimize: Rename "mask" to "z_mask"
- include/exec/tb-lookup.h        |  26 +-
+      tcg/optimize: Split out OptContext
- include/hw/core/cpu.h           |   2 +
+      tcg/optimize: Remove do_default label
- accel/tcg/cpu-exec.c            |  34 ++-
+      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
- accel/tcg/tcg-accel-ops-mttcg.c |   3 +-
+      tcg/optimize: Move prev_mb into OptContext
- accel/tcg/tcg-accel-ops-rr.c    |   2 +-
+      tcg/optimize: Split out init_arguments
- accel/tcg/tcg-accel-ops.c       |   8 +
+      tcg/optimize: Split out copy_propagate
- accel/tcg/tcg-runtime.c         |   6 +-
+      tcg/optimize: Split out fold_call
- accel/tcg/translate-all.c       |  18 +-
+      tcg/optimize: Drop nb_oargs, nb_iargs locals
- linux-user/main.c               |   1 +
+      tcg/optimize: Change fail return for do_constant_folding_cond*
- linux-user/sh4/signal.c         |   8 +-
+      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
- linux-user/syscall.c            |  18 +-
+      tcg/optimize: Split out finish_folding
- softmmu/physmem.c               |   2 +-
+      tcg/optimize: Use a boolean to avoid a mass of continues
- tcg/tcg.c                       |  29 +-
+      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
- tcg/tci.c                       | 526 ++++++++++----------------------
+      tcg/optimize: Split out fold_const{1,2}
- tcg/aarch64/tcg-target.c.inc    | 229 +++++++++++---
+      tcg/optimize: Split out fold_setcond2
- tcg/tci/tcg-target.c.inc        | 204 +++++--------
+      tcg/optimize: Split out fold_brcond2
-files changed, 526 insertions(+), 610 deletions(-)
+      tcg/optimize: Split out fold_brcond
       tcg/optimize: Split out fold_setcond
       tcg/optimize: Split out fold_mulu2_i32
       tcg/optimize: Split out fold_addsub2_i32
       tcg/optimize: Split out fold_movcond
       tcg/optimize: Split out fold_extract2
       tcg/optimize: Split out fold_extract, fold_sextract
       tcg/optimize: Split out fold_deposit
       tcg/optimize: Split out fold_count_zeros
       tcg/optimize: Split out fold_bswap
       tcg/optimize: Split out fold_dup, fold_dup2
       tcg/optimize: Split out fold_mov
       tcg/optimize: Split out fold_xx_to_i
       tcg/optimize: Split out fold_xx_to_x
       tcg/optimize: Split out fold_xi_to_i
       tcg/optimize: Add type to OptContext
       tcg/optimize: Split out fold_to_not
       tcg/optimize: Split out fold_sub_to_neg
       tcg/optimize: Split out fold_xi_to_x
       tcg/optimize: Split out fold_ix_to_i
       tcg/optimize: Split out fold_masks
       tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
       tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
       tcg/optimize: Sink commutative operand swapping into fold functions
       tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
       tcg/optimize: Use fold_xx_to_i for orc
       tcg/optimize: Use fold_xi_to_x for mul
       tcg/optimize: Use fold_xi_to_x for div
       tcg/optimize: Use fold_xx_to_i for rem
       tcg/optimize: Optimize sign extensions
       tcg/optimize: Propagate sign info for logical operations
       tcg/optimize: Propagate sign info for setcond
       tcg/optimize: Propagate sign info for bit counting
       tcg/optimize: Propagate sign info for shifting
---
+ include/fpu/softfloat-macros.h |   82 --
-.25.1
+ include/hw/clock.h             |    5 +-
  include/qemu/host-utils.h      |  121 +-
  include/qemu/int128.h          |   20 +
  target/ppc/int_helper.c        |   23 +-
  tcg/optimize.c                 | 2644 ++++++++++++++++++++++++----------------
  tests/unit/test-div128.c       |  197 +++
  util/host-utils.c              |  147 ++-
  tests/unit/meson.build         |    1 +
 files changed, 2053 insertions(+), 1187 deletions(-)
  create mode 100644 tests/unit/test-div128.c

-[PATCH 03/27] tcg/aarch64: Fix generation of "scalar" vector operations
+[PULL 01/56] qemu/int128: Add int128_{not,xor}
-For some vector operations, "1D" is not a valid type, and there
+From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
 are separate instructions for the 64-bit scalar operation.
-Tested-by: Stefan Weil <sw@weilnetz.de>
+Addition of not and xor on 128-bit integers.
-Buglink: https://bugs.launchpad.net/qemu/+bug/1916112
-Fixes: 14e4c1e2355 ("tcg/aarch64: Add vector operations")
+Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
 Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
 Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
 [rth: Split out logical operations.]
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/aarch64/tcg-target.c.inc | 211 ++++++++++++++++++++++++++++++-----
+ include/qemu/int128.h | 20 ++++++++++++++++++++
-file changed, 181 insertions(+), 30 deletions(-)
+file changed, 20 insertions(+)
-diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
+diff --git a/include/qemu/int128.h b/include/qemu/int128.h
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/aarch64/tcg-target.c.inc
+--- a/include/qemu/int128.h
-+++ b/tcg/aarch64/tcg-target.c.inc
++++ b/include/qemu/int128.h
-@@ -XXX,XX +XXX,XX @@ typedef enum {
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
-     I3606_BIC       = 0x2f001400,
+     return a;
      I3606_ORR       = 0x0f001400,
 +    /* AdvSIMD scalar shift by immediate */
 +    I3609_SSHR      = 0x5f000400,
 +    I3609_SSRA      = 0x5f001400,
 +    I3609_SHL       = 0x5f005400,
 +    I3609_USHR      = 0x7f000400,
 +    I3609_USRA      = 0x7f001400,
 +    I3609_SLI       = 0x7f005400,
 +
 +    /* AdvSIMD scalar three same */
 +    I3611_SQADD     = 0x5e200c00,
 +    I3611_SQSUB     = 0x5e202c00,
 +    I3611_CMGT      = 0x5e203400,
 +    I3611_CMGE      = 0x5e203c00,
 +    I3611_SSHL      = 0x5e204400,
 +    I3611_ADD       = 0x5e208400,
 +    I3611_CMTST     = 0x5e208c00,
 +    I3611_UQADD     = 0x7e200c00,
 +    I3611_UQSUB     = 0x7e202c00,
 +    I3611_CMHI      = 0x7e203400,
 +    I3611_CMHS      = 0x7e203c00,
 +    I3611_USHL      = 0x7e204400,
 +    I3611_SUB       = 0x7e208400,
 +    I3611_CMEQ      = 0x7e208c00,
 +
 +    /* AdvSIMD scalar two-reg misc */
 +    I3612_CMGT0     = 0x5e208800,
 +    I3612_CMEQ0     = 0x5e209800,
 +    I3612_CMLT0     = 0x5e20a800,
 +    I3612_ABS       = 0x5e20b800,
 +    I3612_CMGE0     = 0x7e208800,
 +    I3612_CMLE0     = 0x7e209800,
 +    I3612_NEG       = 0x7e20b800,
 +
      /* AdvSIMD shift by immediate */
      I3614_SSHR      = 0x0f000400,
      I3614_SSRA      = 0x0f001400,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
                | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5);
  }
-+static void tcg_out_insn_3609(TCGContext *s, AArch64Insn insn,
++static inline Int128 int128_not(Int128 a)
 +                              TCGReg rd, TCGReg rn, unsigned immhb)
 +{
-+    tcg_out32(s, insn | immhb << 16 | (rn & 0x1f) << 5 | (rd & 0x1f));
++    return ~a;
 +}
 +
-+static void tcg_out_insn_3611(TCGContext *s, AArch64Insn insn,
+ static inline Int128 int128_and(Int128 a, Int128 b)
-+                              unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
+ {
      return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
      return a | b;
  }
 +static inline Int128 int128_xor(Int128 a, Int128 b)
 +{
-+    tcg_out32(s, insn | (size << 22) | (rm & 0x1f) << 16
++    return a ^ b;
 +              | (rn & 0x1f) << 5 | (rd & 0x1f));
 +}
 +
-+static void tcg_out_insn_3612(TCGContext *s, AArch64Insn insn,
+ static inline Int128 int128_rshift(Int128 a, int n)
-+                              unsigned size, TCGReg rd, TCGReg rn)
+ {
      return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
      return int128_make128(a, (a < 0) ? -1 : 0);
  }
 +static inline Int128 int128_not(Int128 a)
 +{
-+    tcg_out32(s, insn | (size << 22) | (rn & 0x1f) << 5 | (rd & 0x1f));
++    return int128_make128(~a.lo, ~a.hi);
 +}
 +
- static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q,
+ static inline Int128 int128_and(Int128 a, Int128 b)
                                TCGReg rd, TCGReg rn, unsigned immhb)
  {
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+     return int128_make128(a.lo & b.lo, a.hi & b.hi);
-                            unsigned vecl, unsigned vece,
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
-                            const TCGArg *args, const int *const_args)
+     return int128_make128(a.lo | b.lo, a.hi | b.hi);
  }
 +static inline Int128 int128_xor(Int128 a, Int128 b)
 +{
 +    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
 +}
 +
  static inline Int128 int128_rshift(Int128 a, int n)
  {
--    static const AArch64Insn cmp_insn[16] = {
+     int64_t h;
 +    static const AArch64Insn cmp_vec_insn[16] = {
          [TCG_COND_EQ] = I3616_CMEQ,
          [TCG_COND_GT] = I3616_CMGT,
          [TCG_COND_GE] = I3616_CMGE,
          [TCG_COND_GTU] = I3616_CMHI,
          [TCG_COND_GEU] = I3616_CMHS,
      };
 -    static const AArch64Insn cmp0_insn[16] = {
 +    static const AArch64Insn cmp_scalar_insn[16] = {
 +        [TCG_COND_EQ] = I3611_CMEQ,
 +        [TCG_COND_GT] = I3611_CMGT,
 +        [TCG_COND_GE] = I3611_CMGE,
 +        [TCG_COND_GTU] = I3611_CMHI,
 +        [TCG_COND_GEU] = I3611_CMHS,
 +    };
 +    static const AArch64Insn cmp0_vec_insn[16] = {
          [TCG_COND_EQ] = I3617_CMEQ0,
          [TCG_COND_GT] = I3617_CMGT0,
          [TCG_COND_GE] = I3617_CMGE0,
          [TCG_COND_LT] = I3617_CMLT0,
          [TCG_COND_LE] = I3617_CMLE0,
      };
 +    static const AArch64Insn cmp0_scalar_insn[16] = {
 +        [TCG_COND_EQ] = I3612_CMEQ0,
 +        [TCG_COND_GT] = I3612_CMGT0,
 +        [TCG_COND_GE] = I3612_CMGE0,
 +        [TCG_COND_LT] = I3612_CMLT0,
 +        [TCG_COND_LE] = I3612_CMLE0,
 +    };
      TCGType type = vecl + TCG_TYPE_V64;
      unsigned is_q = vecl;
 +    bool is_scalar = !is_q && vece == MO_64;
      TCGArg a0, a1, a2, a3;
      int cmode, imm8;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
          tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
          break;
      case INDEX_op_add_vec:
 -        tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3611, ADD, vece, a0, a1, a2);
 +        } else {
 +            tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
 +        }
          break;
      case INDEX_op_sub_vec:
 -        tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3611, SUB, vece, a0, a1, a2);
 +        } else {
 +            tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
 +        }
          break;
      case INDEX_op_mul_vec:
          tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2);
          break;
      case INDEX_op_neg_vec:
 -        tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3612, NEG, vece, a0, a1);
 +        } else {
 +            tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
 +        }
          break;
      case INDEX_op_abs_vec:
 -        tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3612, ABS, vece, a0, a1);
 +        } else {
 +            tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1);
 +        }
          break;
      case INDEX_op_and_vec:
          if (const_args[2]) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
          tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2);
          break;
      case INDEX_op_ssadd_vec:
 -        tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3611, SQADD, vece, a0, a1, a2);
 +        } else {
 +            tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
 +        }
          break;
      case INDEX_op_sssub_vec:
 -        tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3611, SQSUB, vece, a0, a1, a2);
 +        } else {
 +            tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
 +        }
          break;
      case INDEX_op_usadd_vec:
 -        tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3611, UQADD, vece, a0, a1, a2);
 +        } else {
 +            tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
 +        }
          break;
      case INDEX_op_ussub_vec:
 -        tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3611, UQSUB, vece, a0, a1, a2);
 +        } else {
 +            tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
 +        }
          break;
      case INDEX_op_smax_vec:
          tcg_out_insn(s, 3616, SMAX, is_q, vece, a0, a1, a2);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
          tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
          break;
      case INDEX_op_shli_vec:
 -        tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3609, SHL, a0, a1, a2 + (8 << vece));
 +        } else {
 +            tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
 +        }
          break;
      case INDEX_op_shri_vec:
 -        tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3609, USHR, a0, a1, (16 << vece) - a2);
 +        } else {
 +            tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2);
 +        }
          break;
      case INDEX_op_sari_vec:
 -        tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3609, SSHR, a0, a1, (16 << vece) - a2);
 +        } else {
 +            tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
 +        }
          break;
      case INDEX_op_aa64_sli_vec:
 -        tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece));
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3609, SLI, a0, a2, args[3] + (8 << vece));
 +        } else {
 +            tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece));
 +        }
          break;
      case INDEX_op_shlv_vec:
 -        tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3611, USHL, vece, a0, a1, a2);
 +        } else {
 +            tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2);
 +        }
          break;
      case INDEX_op_aa64_sshl_vec:
 -        tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2);
 +        if (is_scalar) {
 +            tcg_out_insn(s, 3611, SSHL, vece, a0, a1, a2);
 +        } else {
 +            tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2);
 +        }
          break;
      case INDEX_op_cmp_vec:
          {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
              if (cond == TCG_COND_NE) {
                  if (const_args[2]) {
 -                    tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
 +                    if (is_scalar) {
 +                        tcg_out_insn(s, 3611, CMTST, vece, a0, a1, a1);
 +                    } else {
 +                        tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
 +                    }
                  } else {
 -                    tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
 +                    if (is_scalar) {
 +                        tcg_out_insn(s, 3611, CMEQ, vece, a0, a1, a2);
 +                    } else {
 +                        tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
 +                    }
                      tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0);
                  }
              } else {
                  if (const_args[2]) {
 -                    insn = cmp0_insn[cond];
 -                    if (insn) {
 -                        tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
 -                        break;
 +                    if (is_scalar) {
 +                        insn = cmp0_scalar_insn[cond];
 +                        if (insn) {
 +                            tcg_out_insn_3612(s, insn, vece, a0, a1);
 +                            break;
 +                        }
 +                    } else {
 +                        insn = cmp0_vec_insn[cond];
 +                        if (insn) {
 +                            tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
 +                            break;
 +                        }
                      }
                      tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
                      a2 = TCG_VEC_TMP;
                  }
 -                insn = cmp_insn[cond];
 -                if (insn == 0) {
 -                    TCGArg t;
 -                    t = a1, a1 = a2, a2 = t;
 -                    cond = tcg_swap_cond(cond);
 -                    insn = cmp_insn[cond];
 -                    tcg_debug_assert(insn != 0);
 +                if (is_scalar) {
 +                    insn = cmp_scalar_insn[cond];
 +                    if (insn == 0) {
 +                        TCGArg t;
 +                        t = a1, a1 = a2, a2 = t;
 +                        cond = tcg_swap_cond(cond);
 +                        insn = cmp_scalar_insn[cond];
 +                        tcg_debug_assert(insn != 0);
 +                    }
 +                    tcg_out_insn_3611(s, insn, vece, a0, a1, a2);
 +                } else {
 +                    insn = cmp_vec_insn[cond];
 +                    if (insn == 0) {
 +                        TCGArg t;
 +                        t = a1, a1 = a2, a2 = t;
 +                        cond = tcg_swap_cond(cond);
 +                        insn = cmp_vec_insn[cond];
 +                        tcg_debug_assert(insn != 0);
 +                    }
 +                    tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
                  }
 -                tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
              }
          }
          break;
 --
 .25.1

-New patch
+[PULL 02/56] host-utils: move checks out of divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 In preparation for changing the divu128/divs128 implementations
 to allow for quotients larger than 64 bits, move the div-by-zero
 and overflow checks to the callers.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |  5 +++--
  include/qemu/host-utils.h | 34 ++++++++++++---------------------
  target/ppc/int_helper.c   | 14 +++++++++-----
  util/host-utils.c         | 40 ++++++++++++++++++---------------------
 files changed, 42 insertions(+), 51 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
          return 0;
      }
      /*
 -     * Ignore divu128() return value as we've caught div-by-zero and don't
 -     * need different behaviour for overflow.
 +     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 +     * divu128 does not return a valid truncated quotient, so the result will
 +     * be wrong.
       */
      divu128(&lo, &hi, clk->period);
      return lo;
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 -        __uint128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result > UINT64_MAX;
 -    }
 +    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 +    __uint128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
 -static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 -        __int128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result != *plow;
 -    }
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
      uint64_t rt = 0;
      int overflow = 0;
 -    overflow = divu128(&rt, &ra, rb);
 -
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || ra >= rb)) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divu128(&rt, &ra, rb);
      }
      if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
      int64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
 -    int overflow = divs128(&rt, &ra, rb);
 +    int overflow = 0;
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divs128(&rt, &ra, rb);
      }
      if (oe) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
      *phigh = rh;
  }
 -/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
 -/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
 -/* remainder via phigh. */
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +/*
 + * Unsigned 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
      unsigned i;
      uint64_t carry = 0;
 -    if (divisor == 0) {
 -        return 1;
 -    } else if (dhi == 0) {
 +    if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
          *phigh = dlo % divisor;
 -        return 0;
 -    } else if (dhi >= divisor) {
 -        return 1;
      } else {
          for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
          *plow = dlo;
          *phigh = dhi;
 -        return 0;
      }
  }
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +/*
 + * Signed 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
      int sgn_dvdnd = *phigh < 0;
      int sgn_divsr = divisor < 0;
 -    int overflow = 0;
      if (sgn_dvdnd) {
          *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
          divisor = 0 - divisor;
      }
 -    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 +    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
      if (sgn_dvdnd  ^ sgn_divsr) {
          *plow = 0 - *plow;
      }
 -
 -    if (!overflow) {
 -        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
 -            overflow = 1;
 -        }
 -    }
 -
 -    return overflow;
  }
  #endif
 --
 .25.1

-New patch
+[PULL 03/56] host-utils: move udiv_qrnnd() to host-utils
+From: Luis Pires <luis.pires@eldorado.org.br>
 Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
 so it can be reused by divu128().
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/fpu/softfloat-macros.h | 82 ----------------------------------
  include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 files changed, 81 insertions(+), 82 deletions(-)
 diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/fpu/softfloat-macros.h
 +++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
   * so some portions are provided under:
   *  the SoftFloat-2a license
   *  the BSD license
 - *  GPL-v2-or-later
   *
   * Any future contributions to this file after December 1st 2014 will be
   * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
   * THE POSSIBILITY OF SUCH DAMAGE.
   */
 -/* Portions of this work are licensed under the terms of the GNU GPL,
 - * version 2 or later. See the COPYING file in the top-level directory.
 - */
 -
  #ifndef FPU_SOFTFLOAT_MACROS_H
  #define FPU_SOFTFLOAT_MACROS_H
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
  }
 -/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 - * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 - *
 - * Licensed under the GPLv2/LGPLv3
 - */
 -static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 -                                  uint64_t n0, uint64_t d)
 -{
 -#if defined(__x86_64__)
 -    uint64_t q;
 -    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 -    return q;
 -#elif defined(__s390x__) && !defined(__clang__)
 -    /* Need to use a TImode type to get an even register pair for DLGR.  */
 -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 -    *r = n >> 64;
 -    return n;
 -#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 -    /* From Power ISA 2.06, programming note for divdeu.  */
 -    uint64_t q1, q2, Q, r1, r2, R;
 -    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 -        : "=&r"(q1), "=r"(q2)
 -        : "r"(n1), "r"(n0), "r"(d));
 -    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 -    r2 = n0 - (q2 * d);
 -    Q = q1 + q2;
 -    R = r1 + r2;
 -    if (R >= d || R < r2) { /* overflow implies R > d */
 -        Q += 1;
 -        R -= d;
 -    }
 -    *r = R;
 -    return Q;
 -#else
 -    uint64_t d0, d1, q0, q1, r1, r0, m;
 -
 -    d0 = (uint32_t)d;
 -    d1 = d >> 32;
 -
 -    r1 = n1 % d1;
 -    q1 = n1 / d1;
 -    m = q1 * d0;
 -    r1 = (r1 << 32) | (n0 >> 32);
 -    if (r1 < m) {
 -        q1 -= 1;
 -        r1 += d;
 -        if (r1 >= d) {
 -            if (r1 < m) {
 -                q1 -= 1;
 -                r1 += d;
 -            }
 -        }
 -    }
 -    r1 -= m;
 -
 -    r0 = r1 % d1;
 -    q0 = r1 / d1;
 -    m = q0 * d0;
 -    r0 = (r0 << 32) | (uint32_t)n0;
 -    if (r0 < m) {
 -        q0 -= 1;
 -        r0 += d;
 -        if (r0 >= d) {
 -            if (r0 < m) {
 -                q0 -= 1;
 -                r0 += d;
 -            }
 -        }
 -    }
 -    r0 -= m;
 -
 -    *r = r0;
 -    return (q1 << 32) | q0;
 -#endif
 -}
 -
  /*----------------------------------------------------------------------------
  | Returns an approximation to the square root of the 32-bit significand given
  | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
   * THE SOFTWARE.
   */
 +/* Portions of this work are licensed under the terms of the GNU GPL,
 + * version 2 or later. See the COPYING file in the top-level directory.
 + */
 +
  #ifndef HOST_UTILS_H
  #define HOST_UTILS_H
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
   */
  void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 +/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 + * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 + *
 + * Licensed under the GPLv2/LGPLv3
 + */
 +static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 +                                  uint64_t n0, uint64_t d)
 +{
 +#if defined(__x86_64__)
 +    uint64_t q;
 +    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 +    return q;
 +#elif defined(__s390x__) && !defined(__clang__)
 +    /* Need to use a TImode type to get an even register pair for DLGR.  */
 +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 +    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 +    *r = n >> 64;
 +    return n;
 +#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 +    /* From Power ISA 2.06, programming note for divdeu.  */
 +    uint64_t q1, q2, Q, r1, r2, R;
 +    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 +        : "=&r"(q1), "=r"(q2)
 +        : "r"(n1), "r"(n0), "r"(d));
 +    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 +    r2 = n0 - (q2 * d);
 +    Q = q1 + q2;
 +    R = r1 + r2;
 +    if (R >= d || R < r2) { /* overflow implies R > d */
 +        Q += 1;
 +        R -= d;
 +    }
 +    *r = R;
 +    return Q;
 +#else
 +    uint64_t d0, d1, q0, q1, r1, r0, m;
 +
 +    d0 = (uint32_t)d;
 +    d1 = d >> 32;
 +
 +    r1 = n1 % d1;
 +    q1 = n1 / d1;
 +    m = q1 * d0;
 +    r1 = (r1 << 32) | (n0 >> 32);
 +    if (r1 < m) {
 +        q1 -= 1;
 +        r1 += d;
 +        if (r1 >= d) {
 +            if (r1 < m) {
 +                q1 -= 1;
 +                r1 += d;
 +            }
 +        }
 +    }
 +    r1 -= m;
 +
 +    r0 = r1 % d1;
 +    q0 = r1 / d1;
 +    m = q0 * d0;
 +    r0 = (r0 << 32) | (uint32_t)n0;
 +    if (r0 < m) {
 +        q0 -= 1;
 +        r0 += d;
 +        if (r0 >= d) {
 +            if (r0 < m) {
 +                q0 -= 1;
 +                r0 += d;
 +            }
 +        }
 +    }
 +    r0 -= m;
 +
 +    *r = r0;
 +    return (q1 << 32) | q0;
 +#endif
 +}
 +
  #endif
 --
 .25.1

-New patch
+[PULL 04/56] host-utils: add 128-bit quotient support to divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 These will be used to implement new decimal floating point
 instructions from Power ISA 3.1.
 The remainder is now returned directly by divu128/divs128,
 freeing up phigh to receive the high 64 bits of the quotient.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |   6 +-
  include/qemu/host-utils.h |  20 ++++--
  target/ppc/int_helper.c   |   9 +--
  util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 files changed, 108 insertions(+), 60 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
      if (clk->period == 0) {
          return 0;
      }
 -    /*
 -     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 -     * divu128 does not return a valid truncated quotient, so the result will
 -     * be wrong.
 -     */
 +
      divu128(&lo, &hi, clk->period);
      return lo;
  }
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
 +                               uint64_t divisor)
  {
      __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
      __uint128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
 -static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
 +                              int64_t divisor)
  {
 -    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
      __int128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
  uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
  {
 -    int64_t rt = 0;
 +    uint64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
      int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
      int cr;
      uint64_t lo_value;
      uint64_t hi_value;
 +    uint64_t rem;
      ppc_avr_t ret = { .u64 = { 0, 0 } };
      if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
           * In that case, we leave r unchanged.
           */
      } else {
 -        divu128(&lo_value, &hi_value, 1000000000000000ULL);
 +        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 -        for (i = 1; i < 16; hi_value /= 10, i++) {
 -            bcd_put_digit(&ret, hi_value % 10, i);
 +        for (i = 1; i < 16; rem /= 10, i++) {
 +            bcd_put_digit(&ret, rem % 10, i);
          }
          for (; i < 32; lo_value /= 10, i++) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
  }
  /*
 - * Unsigned 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Unsigned 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
 -    unsigned i;
 -    uint64_t carry = 0;
 +    uint64_t rem, dhighest;
 +    int sh;
      if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
 -        *phigh = dlo % divisor;
 +        *phigh = 0;
 +        return dlo % divisor;
      } else {
 +        sh = clz64(divisor);
 -        for (i = 0; i < 64; i++) {
 -            carry = dhi >> 63;
 -            dhi = (dhi << 1) | (dlo >> 63);
 -            if (carry || (dhi >= divisor)) {
 -                dhi -= divisor;
 -                carry = 1;
 -            } else {
 -                carry = 0;
 +        if (dhi < divisor) {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
              }
 -            dlo = (dlo << 1) | carry;
 +
 +            *phigh = 0;
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
 +        } else {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhighest = dhi >> (64 - sh);
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
 +
 +                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
 +            } else {
 +                /**
 +                 * dhi >= divisor
 +                 * Since the MSB of divisor is set (sh == 0),
 +                 * (dhi - divisor) < divisor
 +                 *
 +                 * Thus, the high part of the quotient is 1, and we can
 +                 * calculate the low part with a single call to udiv_qrnnd
 +                 * after subtracting divisor from dhi
 +                 */
 +                dhi -= divisor;
 +                *phigh = 1;
 +            }
 +
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
          }
 -        *plow = dlo;
 -        *phigh = dhi;
 +        /*
 +         * since the dividend/divisor might have been normalized,
 +         * the remainder might also have to be shifted back
 +         */
 +        return rem >> sh;
      }
  }
  /*
 - * Signed 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Signed 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    int sgn_dvdnd = *phigh < 0;
 -    int sgn_divsr = divisor < 0;
 +    bool neg_quotient = false, neg_remainder = false;
 +    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
 +    uint64_t rem;
 -    if (sgn_dvdnd) {
 -        *plow = ~(*plow);
 -        *phigh = ~(*phigh);
 -        if (*plow == (int64_t)-1) {
 +    if (*phigh < 0) {
 +        neg_quotient = !neg_quotient;
 +        neg_remainder = !neg_remainder;
 +
 +        if (unsig_lo == 0) {
 +            unsig_hi = -unsig_hi;
 +        } else {
 +            unsig_hi = ~unsig_hi;
 +            unsig_lo = -unsig_lo;
 +        }
 +    }
 +
 +    if (divisor < 0) {
 +        neg_quotient = !neg_quotient;
 +
 +        divisor = -divisor;
 +    }
 +
 +    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
 +
 +    if (neg_quotient) {
 +        if (unsig_lo == 0) {
 +            *phigh = -unsig_hi;
              *plow = 0;
 -            (*phigh)++;
 -         } else {
 -            (*plow)++;
 -         }
 +        } else {
 +            *phigh = ~unsig_hi;
 +            *plow = -unsig_lo;
 +        }
 +    } else {
 +        *phigh = unsig_hi;
 +        *plow = unsig_lo;
      }
 -    if (sgn_divsr) {
 -        divisor = 0 - divisor;
 -    }
 -
 -    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 -
 -    if (sgn_dvdnd  ^ sgn_divsr) {
 -        *plow = 0 - *plow;
 +    if (neg_remainder) {
 +        return -rem;
 +    } else {
 +        return rem;
      }
  }
  #endif
 --
 .25.1

-New patch
+[PULL 05/56] host-utils: add unit tests for divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
  tests/unit/meson.build   |   1 +
 files changed, 198 insertions(+)
  create mode 100644 tests/unit/test-div128.c
 diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Test 128-bit division functions
 + *
 + * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu/host-utils.h"
 +
 +typedef struct {
 +    uint64_t high;
 +    uint64_t low;
 +    uint64_t rhigh;
 +    uint64_t rlow;
 +    uint64_t divisor;
 +    uint64_t remainder;
 +} test_data_unsigned;
 +
 +typedef struct {
 +    int64_t high;
 +    uint64_t low;
 +    int64_t rhigh;
 +    uint64_t rlow;
 +    int64_t divisor;
 +    int64_t remainder;
 +} test_data_signed;
 +
 +static const test_data_unsigned test_table_unsigned[] = {
 +    /* Dividend fits in 64 bits */
 +    { 0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000003ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000002ULL, 0x0000000000000001ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0xa000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000002ULL,
 +      0x4000000000000000ULL, 0x2000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x8000000000000000ULL, 0x0000000000000000ULL},
 +
 +    /* Dividend > 64 bits, with MSB 0 */
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x000000000000000dULL,
 +      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
 +      0x0000000000000010ULL, 0x0000000000000001ULL},
 +
 +    /* Dividend > 64 bits, with MSB 1 */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
 +      0x0000000000000010ULL, 0x000000000000000fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
 +      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
 +
 +    /**
 +     * Divisor == 64 bits, with MSB 1
 +     * and high 64 bits of dividend >= divisor
 +     * (for testing normalization)
 +     */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0xfddbb9977553310aULL,
 +      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
 +
 +    /* Dividend > 64 bits, divisor almost as big */
 +    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
 +      0x0000000000000000ULL, 0x000000000000000fULL,
 +      0x123456789abcdefeULL, 0x123456789abcde1fULL},
 +};
 +
 +static const test_data_signed test_table_signed[] = {
 +    /* Positive dividend, positive/negative divisors */
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0x0000000000000008LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0xfffffffffffffff8LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0x0000000000000237LL, 0x0000000000000183LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0xfffffffffffffdc9LL, 0x0000000000000183LL},
 +
 +    /* Negative dividend, positive/negative divisors */
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0x0000000000000008LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0x0000000000000237LL, 0xfffffffffffffe7dLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
 +};
 +
 +static void test_divu128(void)
 +{
 +    int i;
 +    uint64_t rem;
 +    test_data_unsigned tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
 +        tmp = test_table_unsigned[i];
 +
 +        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +static void test_divs128(void)
 +{
 +    int i;
 +    int64_t rem;
 +    test_data_signed tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
 +        tmp = test_table_signed[i];
 +
 +        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +int main(int argc, char **argv)
 +{
 +    g_test_init(&argc, &argv, NULL);
 +    g_test_add_func("/host-utils/test_divu128", test_divu128);
 +    g_test_add_func("/host-utils/test_divs128", test_divs128);
 +    return g_test_run();
 +}
 diff --git a/tests/unit/meson.build b/tests/unit/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/unit/meson.build
 +++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
    # all code tested by test-x86-cpuid is inside topology.h
    'test-x86-cpuid': [],
    'test-cutils': [],
 +  'test-div128': [],
    'test-shift128': [],
    'test-mul64': [],
    # all code tested by test-int128 is inside int128.h
 --
 .25.1

-[PATCH 18/27] tcg/tci: Reduce use of tci_read_r64
+[PULL 06/56] tcg/optimize: Rename "mask" to "z_mask"
-In all cases restricted to 64-bit hosts, tcg_read_r is
+Prepare for tracking different masks by renaming this one.
 identical.  We retain the 64-bit symbol for the single
 case of INDEX_op_qemu_st_i64.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 93 +++++++++++++++++++++++++------------------------------
+ tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
-file changed, 42 insertions(+), 51 deletions(-)
+file changed, 72 insertions(+), 70 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index)
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-     return regs[index];
+     TCGTemp *prev_copy;
      TCGTemp *next_copy;
      uint64_t val;
 -    uint64_t mask;
 +    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
  } TempOptInfo;
  static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
      ti->next_copy = ts;
      ti->prev_copy = ts;
      ti->is_const = false;
 -    ti->mask = -1;
 +    ti->z_mask = -1;
  }
--#if TCG_TARGET_REG_BITS == 64
+ static void reset_temp(TCGArg arg)
--static uint64_t tci_read_reg64(const tcg_target_ulong *regs, TCGReg index)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
--{
+     if (ts->kind == TEMP_CONST) {
--    return tci_read_reg(regs, index);
+         ti->is_const = true;
--}
+         ti->val = ts->val;
--#endif
+-        ti->mask = ts->val;
--
++        ti->z_mask = ts->val;
- static void
+         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
- tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value)
+             /* High bits of a 32-bit quantity are garbage.  */
- {
+-            ti->mask |= ~0xffffffffull;
-@@ -XXX,XX +XXX,XX @@ static uint64_t tci_read_r64(const tcg_target_ulong *regs,
++            ti->z_mask |= ~0xffffffffull;
- static uint64_t tci_read_r64(const tcg_target_ulong *regs,
+         }
-                              const uint8_t **tb_ptr)
+     } else {
- {
+         ti->is_const = false;
--    uint64_t value = tci_read_reg64(regs, **tb_ptr);
+-        ti->mask = -1;
--    *tb_ptr += 1;
++        ti->z_mask = -1;
--    return value;
+     }
 +    return tci_read_r(regs, tb_ptr);
  }
- #endif
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+     const TCGOpDef *def;
- #elif TCG_TARGET_REG_BITS == 64
+     TempOptInfo *di;
-         case INDEX_op_setcond_i64:
+     TempOptInfo *si;
-             t0 = *tb_ptr++;
+-    uint64_t mask;
--            t1 = tci_read_r64(regs, &tb_ptr);
++    uint64_t z_mask;
--            t2 = tci_read_r64(regs, &tb_ptr);
+     TCGOpcode new_op;
-+            t1 = tci_read_r(regs, &tb_ptr);
-+            t2 = tci_read_r(regs, &tb_ptr);
+     if (ts_are_copies(dst_ts, src_ts)) {
-             condition = *tb_ptr++;
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
-             tci_write_reg(regs, t0, tci_compare64(t1, t2, condition));
+     op->args[0] = dst;
-             break;
+     op->args[1] = src;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
- #if TCG_TARGET_REG_BITS == 64
+-    mask = si->mask;
-         case INDEX_op_mov_i64:
++    z_mask = si->z_mask;
-             t0 = *tb_ptr++;
+     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
--            t1 = tci_read_r64(regs, &tb_ptr);
+         /* High bits of the destination are now garbage.  */
-+            t1 = tci_read_r(regs, &tb_ptr);
+-        mask |= ~0xffffffffull;
-             tci_write_reg(regs, t0, t1);
++        z_mask |= ~0xffffffffull;
-             break;
+     }
-         case INDEX_op_tci_movi_i64:
+-    di->mask = mask;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++    di->z_mask = z_mask;
-             tci_write_reg(regs, t0, *(uint64_t *)(t1 + t2));
-             break;
+     if (src_ts->type == dst_ts->type) {
-         case INDEX_op_st_i64:
+         TempOptInfo *ni = ts_info(si->next_copy);
--            t0 = tci_read_r64(regs, &tb_ptr);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+            t0 = tci_read_r(regs, &tb_ptr);
+     }
-             t1 = tci_read_r(regs, &tb_ptr);
-             t2 = tci_read_s32(&tb_ptr);
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-             *(uint64_t *)(t1 + t2) = t0;
+-        uint64_t mask, partmask, affected, tmp;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++        uint64_t z_mask, partmask, affected, tmp;
+         int nb_oargs, nb_iargs;
-         case INDEX_op_add_i64:
+         TCGOpcode opc = op->opc;
-             t0 = *tb_ptr++;
+         const TCGOpDef *def = &tcg_op_defs[opc];
--            t1 = tci_read_r64(regs, &tb_ptr);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            t2 = tci_read_r64(regs, &tb_ptr);
-+            t1 = tci_read_r(regs, &tb_ptr);
+         /* Simplify using known-zero bits. Currently only ops with a single
-+            t2 = tci_read_r(regs, &tb_ptr);
+            output argument is supported. */
-             tci_write_reg(regs, t0, t1 + t2);
+-        mask = -1;
-             break;
++        z_mask = -1;
-         case INDEX_op_sub_i64:
+         affected = -1;
-             t0 = *tb_ptr++;
+         switch (opc) {
--            t1 = tci_read_r64(regs, &tb_ptr);
+         CASE_OP_32_64(ext8s):
--            t2 = tci_read_r64(regs, &tb_ptr);
+-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
-+            t1 = tci_read_r(regs, &tb_ptr);
++            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
-+            t2 = tci_read_r(regs, &tb_ptr);
+                 break;
-             tci_write_reg(regs, t0, t1 - t2);
+             }
-             break;
+             QEMU_FALLTHROUGH;
-         case INDEX_op_mul_i64:
+         CASE_OP_32_64(ext8u):
-             t0 = *tb_ptr++;
+-            mask = 0xff;
--            t1 = tci_read_r64(regs, &tb_ptr);
++            z_mask = 0xff;
--            t2 = tci_read_r64(regs, &tb_ptr);
+             goto and_const;
-+            t1 = tci_read_r(regs, &tb_ptr);
+         CASE_OP_32_64(ext16s):
-+            t2 = tci_read_r(regs, &tb_ptr);
+-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
-             tci_write_reg(regs, t0, t1 * t2);
++            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
-             break;
+                 break;
-         case INDEX_op_div_i64:
+             }
-             t0 = *tb_ptr++;
+             QEMU_FALLTHROUGH;
--            t1 = tci_read_r64(regs, &tb_ptr);
+         CASE_OP_32_64(ext16u):
--            t2 = tci_read_r64(regs, &tb_ptr);
+-            mask = 0xffff;
-+            t1 = tci_read_r(regs, &tb_ptr);
++            z_mask = 0xffff;
-+            t2 = tci_read_r(regs, &tb_ptr);
+             goto and_const;
-             tci_write_reg(regs, t0, (int64_t)t1 / (int64_t)t2);
+         case INDEX_op_ext32s_i64:
-             break;
+-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
-         case INDEX_op_divu_i64:
++            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-             t0 = *tb_ptr++;
+                 break;
--            t1 = tci_read_r64(regs, &tb_ptr);
+             }
--            t2 = tci_read_r64(regs, &tb_ptr);
+             QEMU_FALLTHROUGH;
-+            t1 = tci_read_r(regs, &tb_ptr);
+         case INDEX_op_ext32u_i64:
-+            t2 = tci_read_r(regs, &tb_ptr);
+-            mask = 0xffffffffU;
-             tci_write_reg(regs, t0, (uint64_t)t1 / (uint64_t)t2);
++            z_mask = 0xffffffffU;
-             break;
+             goto and_const;
-         case INDEX_op_rem_i64:
-             t0 = *tb_ptr++;
+         CASE_OP_32_64(and):
--            t1 = tci_read_r64(regs, &tb_ptr);
+-            mask = arg_info(op->args[2])->mask;
--            t2 = tci_read_r64(regs, &tb_ptr);
++            z_mask = arg_info(op->args[2])->z_mask;
-+            t1 = tci_read_r(regs, &tb_ptr);
+             if (arg_is_const(op->args[2])) {
-+            t2 = tci_read_r(regs, &tb_ptr);
+         and_const:
-             tci_write_reg(regs, t0, (int64_t)t1 % (int64_t)t2);
+-                affected = arg_info(op->args[1])->mask & ~mask;
-             break;
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-         case INDEX_op_remu_i64:
+             }
-             t0 = *tb_ptr++;
+-            mask = arg_info(op->args[1])->mask & mask;
--            t1 = tci_read_r64(regs, &tb_ptr);
++            z_mask = arg_info(op->args[1])->z_mask & z_mask;
--            t2 = tci_read_r64(regs, &tb_ptr);
+             break;
-+            t1 = tci_read_r(regs, &tb_ptr);
-+            t2 = tci_read_r(regs, &tb_ptr);
+         case INDEX_op_ext_i32_i64:
-             tci_write_reg(regs, t0, (uint64_t)t1 % (uint64_t)t2);
+-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
-             break;
++            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-         case INDEX_op_and_i64:
+                 break;
-             t0 = *tb_ptr++;
+             }
--            t1 = tci_read_r64(regs, &tb_ptr);
+             QEMU_FALLTHROUGH;
--            t2 = tci_read_r64(regs, &tb_ptr);
+         case INDEX_op_extu_i32_i64:
-+            t1 = tci_read_r(regs, &tb_ptr);
+             /* We do not compute affected as it is a size changing op.  */
-+            t2 = tci_read_r(regs, &tb_ptr);
+-            mask = (uint32_t)arg_info(op->args[1])->mask;
-             tci_write_reg(regs, t0, t1 & t2);
++            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
              break;
-         case INDEX_op_or_i64:
-             t0 = *tb_ptr++;
+         CASE_OP_32_64(andc):
--            t1 = tci_read_r64(regs, &tb_ptr);
+             /* Known-zeros does not imply known-ones.  Therefore unless
--            t2 = tci_read_r64(regs, &tb_ptr);
+                op->args[2] is constant, we can't infer anything from it.  */
-+            t1 = tci_read_r(regs, &tb_ptr);
+             if (arg_is_const(op->args[2])) {
-+            t2 = tci_read_r(regs, &tb_ptr);
+-                mask = ~arg_info(op->args[2])->mask;
-             tci_write_reg(regs, t0, t1 | t2);
++                z_mask = ~arg_info(op->args[2])->z_mask;
-             break;
+                 goto and_const;
-         case INDEX_op_xor_i64:
+             }
-             t0 = *tb_ptr++;
+             /* But we certainly know nothing outside args[1] may be set. */
--            t1 = tci_read_r64(regs, &tb_ptr);
+-            mask = arg_info(op->args[1])->mask;
--            t2 = tci_read_r64(regs, &tb_ptr);
++            z_mask = arg_info(op->args[1])->z_mask;
-+            t1 = tci_read_r(regs, &tb_ptr);
+             break;
-+            t2 = tci_read_r(regs, &tb_ptr);
-             tci_write_reg(regs, t0, t1 ^ t2);
+         case INDEX_op_sar_i32:
-             break;
+             if (arg_is_const(op->args[2])) {
+                 tmp = arg_info(op->args[2])->val & 31;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
++                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
-         case INDEX_op_shl_i64:
+             }
-             t0 = *tb_ptr++;
+             break;
--            t1 = tci_read_r64(regs, &tb_ptr);
+         case INDEX_op_sar_i64:
--            t2 = tci_read_r64(regs, &tb_ptr);
+             if (arg_is_const(op->args[2])) {
-+            t1 = tci_read_r(regs, &tb_ptr);
+                 tmp = arg_info(op->args[2])->val & 63;
-+            t2 = tci_read_r(regs, &tb_ptr);
+-                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
-             tci_write_reg(regs, t0, t1 << (t2 & 63));
++                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_shr_i32:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 31;
 -                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_shr_i64:
-             t0 = *tb_ptr++;
+             if (arg_is_const(op->args[2])) {
--            t1 = tci_read_r64(regs, &tb_ptr);
+                 tmp = arg_info(op->args[2])->val & 63;
--            t2 = tci_read_r64(regs, &tb_ptr);
+-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
-+            t1 = tci_read_r(regs, &tb_ptr);
++                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
-+            t2 = tci_read_r(regs, &tb_ptr);
+             }
-             tci_write_reg(regs, t0, t1 >> (t2 & 63));
+             break;
-             break;
-         case INDEX_op_sar_i64:
+         case INDEX_op_extrl_i64_i32:
-             t0 = *tb_ptr++;
+-            mask = (uint32_t)arg_info(op->args[1])->mask;
--            t1 = tci_read_r64(regs, &tb_ptr);
++            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
--            t2 = tci_read_r64(regs, &tb_ptr);
+             break;
-+            t1 = tci_read_r(regs, &tb_ptr);
+         case INDEX_op_extrh_i64_i32:
-+            t2 = tci_read_r(regs, &tb_ptr);
+-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
-             tci_write_reg(regs, t0, ((int64_t)t1 >> (t2 & 63)));
++            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
              break;
- #if TCG_TARGET_HAS_rot_i64
-         case INDEX_op_rotl_i64:
+         CASE_OP_32_64(shl):
-             t0 = *tb_ptr++;
+             if (arg_is_const(op->args[2])) {
--            t1 = tci_read_r64(regs, &tb_ptr);
+                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
--            t2 = tci_read_r64(regs, &tb_ptr);
+-                mask = arg_info(op->args[1])->mask << tmp;
-+            t1 = tci_read_r(regs, &tb_ptr);
++                z_mask = arg_info(op->args[1])->z_mask << tmp;
-+            t2 = tci_read_r(regs, &tb_ptr);
+             }
-             tci_write_reg(regs, t0, rol64(t1, t2 & 63));
+             break;
-             break;
-         case INDEX_op_rotr_i64:
+         CASE_OP_32_64(neg):
-             t0 = *tb_ptr++;
+             /* Set to 1 all bits to the left of the rightmost.  */
--            t1 = tci_read_r64(regs, &tb_ptr);
+-            mask = -(arg_info(op->args[1])->mask
--            t2 = tci_read_r64(regs, &tb_ptr);
+-                     & -arg_info(op->args[1])->mask);
-+            t1 = tci_read_r(regs, &tb_ptr);
++            z_mask = -(arg_info(op->args[1])->z_mask
-+            t2 = tci_read_r(regs, &tb_ptr);
++                       & -arg_info(op->args[1])->z_mask);
-             tci_write_reg(regs, t0, ror64(t1, t2 & 63));
+             break;
-             break;
- #endif
+         CASE_OP_32_64(deposit):
- #if TCG_TARGET_HAS_deposit_i64
+-            mask = deposit64(arg_info(op->args[1])->mask,
-         case INDEX_op_deposit_i64:
+-                             op->args[3], op->args[4],
-             t0 = *tb_ptr++;
+-                             arg_info(op->args[2])->mask);
--            t1 = tci_read_r64(regs, &tb_ptr);
++            z_mask = deposit64(arg_info(op->args[1])->z_mask,
--            t2 = tci_read_r64(regs, &tb_ptr);
++                               op->args[3], op->args[4],
-+            t1 = tci_read_r(regs, &tb_ptr);
++                               arg_info(op->args[2])->z_mask);
-+            t2 = tci_read_r(regs, &tb_ptr);
+             break;
-             tmp16 = *tb_ptr++;
-             tmp8 = *tb_ptr++;
+         CASE_OP_32_64(extract):
-             tmp64 = (((1ULL << tmp8) - 1) << tmp16);
+-            mask = extract64(arg_info(op->args[1])->mask,
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                             op->args[2], op->args[3]);
-             break;
++            z_mask = extract64(arg_info(op->args[1])->z_mask,
- #endif
++                               op->args[2], op->args[3]);
-         case INDEX_op_brcond_i64:
+             if (op->args[2] == 0) {
--            t0 = tci_read_r64(regs, &tb_ptr);
+-                affected = arg_info(op->args[1])->mask & ~mask;
--            t1 = tci_read_r64(regs, &tb_ptr);
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-+            t0 = tci_read_r(regs, &tb_ptr);
+             }
-+            t1 = tci_read_r(regs, &tb_ptr);
+             break;
-             condition = *tb_ptr++;
+         CASE_OP_32_64(sextract):
-             label = tci_read_label(&tb_ptr);
+-            mask = sextract64(arg_info(op->args[1])->mask,
-             if (tci_compare64(t0, t1, condition)) {
+-                              op->args[2], op->args[3]);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
- #if TCG_TARGET_HAS_bswap64_i64
+-                affected = arg_info(op->args[1])->mask & ~mask;
-         case INDEX_op_bswap64_i64:
++            z_mask = sextract64(arg_info(op->args[1])->z_mask,
-             t0 = *tb_ptr++;
++                                op->args[2], op->args[3]);
--            t1 = tci_read_r64(regs, &tb_ptr);
++            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
-+            t1 = tci_read_r(regs, &tb_ptr);
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-             tci_write_reg(regs, t0, bswap64(t1));
+             }
              break;
- #endif
- #if TCG_TARGET_HAS_not_i64
+         CASE_OP_32_64(or):
-         case INDEX_op_not_i64:
+         CASE_OP_32_64(xor):
-             t0 = *tb_ptr++;
+-            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
--            t1 = tci_read_r64(regs, &tb_ptr);
++            z_mask = arg_info(op->args[1])->z_mask
-+            t1 = tci_read_r(regs, &tb_ptr);
++                   | arg_info(op->args[2])->z_mask;
-             tci_write_reg(regs, t0, ~t1);
+             break;
-             break;
- #endif
+         case INDEX_op_clz_i32:
- #if TCG_TARGET_HAS_neg_i64
+         case INDEX_op_ctz_i32:
-         case INDEX_op_neg_i64:
+-            mask = arg_info(op->args[2])->mask | 31;
-             t0 = *tb_ptr++;
++            z_mask = arg_info(op->args[2])->z_mask | 31;
--            t1 = tci_read_r64(regs, &tb_ptr);
+             break;
-+            t1 = tci_read_r(regs, &tb_ptr);
-             tci_write_reg(regs, t0, -t1);
+         case INDEX_op_clz_i64:
-             break;
+         case INDEX_op_ctz_i64:
- #endif
+-            mask = arg_info(op->args[2])->mask | 63;
 +            z_mask = arg_info(op->args[2])->z_mask | 63;
              break;
          case INDEX_op_ctpop_i32:
 -            mask = 32 | 31;
 +            z_mask = 32 | 31;
              break;
          case INDEX_op_ctpop_i64:
 -            mask = 64 | 63;
 +            z_mask = 64 | 63;
              break;
          CASE_OP_32_64(setcond):
          case INDEX_op_setcond2_i32:
 -            mask = 1;
 +            z_mask = 1;
              break;
          CASE_OP_32_64(movcond):
 -            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
 +            z_mask = arg_info(op->args[3])->z_mask
 +                   | arg_info(op->args[4])->z_mask;
              break;
          CASE_OP_32_64(ld8u):
 -            mask = 0xff;
 +            z_mask = 0xff;
              break;
          CASE_OP_32_64(ld16u):
 -            mask = 0xffff;
 +            z_mask = 0xffff;
              break;
          case INDEX_op_ld32u_i64:
 -            mask = 0xffffffffu;
 +            z_mask = 0xffffffffu;
              break;
          CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                  MemOp mop = get_memop(oi);
                  if (!(mop & MO_SIGN)) {
 -                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 +                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                  }
              }
              break;
          CASE_OP_32_64(bswap16):
 -            mask = arg_info(op->args[1])->mask;
 -            if (mask <= 0xffff) {
 +            z_mask = arg_info(op->args[1])->z_mask;
 +            if (z_mask <= 0xffff) {
                  op->args[2] |= TCG_BSWAP_IZ;
              }
 -            mask = bswap16(mask);
 +            z_mask = bswap16(z_mask);
              switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
              case TCG_BSWAP_OZ:
                  break;
              case TCG_BSWAP_OS:
 -                mask = (int16_t)mask;
 +                z_mask = (int16_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(16, 48);
 +                z_mask |= MAKE_64BIT_MASK(16, 48);
                  break;
              }
              break;
          case INDEX_op_bswap32_i64:
 -            mask = arg_info(op->args[1])->mask;
 -            if (mask <= 0xffffffffu) {
 +            z_mask = arg_info(op->args[1])->z_mask;
 +            if (z_mask <= 0xffffffffu) {
                  op->args[2] |= TCG_BSWAP_IZ;
              }
 -            mask = bswap32(mask);
 +            z_mask = bswap32(z_mask);
              switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
              case TCG_BSWAP_OZ:
                  break;
              case TCG_BSWAP_OS:
 -                mask = (int32_t)mask;
 +                z_mask = (int32_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(32, 32);
 +                z_mask |= MAKE_64BIT_MASK(32, 32);
                  break;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          /* 32-bit ops generate 32-bit results.  For the result is zero test
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
 -        partmask = mask;
 +        partmask = z_mask;
          if (!(def->flags & TCG_OPF_64BIT)) {
 -            mask |= ~(tcg_target_ulong)0xffffffffu;
 +            z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     vs the high word of the input.  */
              do_setcond_high:
                  reset_temp(op->args[0]);
 -                arg_info(op->args[0])->mask = 1;
 +                arg_info(op->args[0])->z_mask = 1;
                  op->opc = INDEX_op_setcond_i32;
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              do_setcond_low:
                  reset_temp(op->args[0]);
 -                arg_info(op->args[0])->mask = 1;
 +                arg_info(op->args[0])->z_mask = 1;
                  op->opc = INDEX_op_setcond_i32;
                  op->args[2] = op->args[3];
                  op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              /* Default case: we know nothing about operation (or were unable
                 to compute the operation result) so no propagation is done.
                 We trash everything if the operation is the end of a basic
 -               block, otherwise we only trash the output args.  "mask" is
 +               block, otherwise we only trash the output args.  "z_mask" is
                 the non-zero bits mask for the first output arg.  */
              if (def->flags & TCG_OPF_BB_END) {
                  memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Save the corresponding known-zero bits mask for the
                         first output argument (only one supported so far). */
                      if (i == 0) {
 -                        arg_info(op->args[i])->mask = mask;
 +                        arg_info(op->args[i])->z_mask = z_mask;
                      }
                  }
              }
 --
 .25.1

-New patch
+[PULL 07/56] tcg/optimize: Split out OptContext
+Provide what will become a larger context for splitting
+the very large tcg_optimize function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
+ 1 file changed, 40 insertions(+), 37 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+ } TempOptInfo;
++typedef struct OptContext {
++    TCGTempSet temps_used;
++} OptContext;
++
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+ {
+     return ts->state_ptr;
+@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
+ }
+ /* Initialize and activate a temporary.  */
+-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
++static void init_ts_info(OptContext *ctx, TCGTemp *ts)
+ {
+     size_t idx = temp_idx(ts);
+     TempOptInfo *ti;
+-    if (test_bit(idx, temps_used->l)) {
++    if (test_bit(idx, ctx->temps_used.l)) {
+         return;
+     }
+-    set_bit(idx, temps_used->l);
++    set_bit(idx, ctx->temps_used.l);
+     ti = ts->state_ptr;
+     if (ti == NULL) {
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+     }
+ }
+-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
++static void init_arg_info(OptContext *ctx, TCGArg arg)
+ {
+-    init_ts_info(temps_used, arg_temp(arg));
++    init_ts_info(ctx, arg_temp(arg));
+ }
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+     }
+ }
+-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
++static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
+                              TCGOp *op, TCGArg dst, uint64_t val)
+ {
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+     /* Convert movi to mov with constant temp. */
+     tv = tcg_constant_internal(type, val);
+-    init_ts_info(temps_used, tv);
++    init_ts_info(ctx, tv);
+     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+ }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+     TCGOp *op, *op_next, *prev_mb = NULL;
+-    TCGTempSet temps_used;
++    OptContext ctx = {};
+     /* Array VALS has an element for each temp.
+        If this temp holds a constant then its value is kept in VALS' element.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+     nb_temps = s->nb_temps;
+     nb_globals = s->nb_globals;
+-    memset(&temps_used, 0, sizeof(temps_used));
+     for (i = 0; i < nb_temps; ++i) {
+         s->temps[i].state_ptr = NULL;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+                 TCGTemp *ts = arg_temp(op->args[i]);
+                 if (ts) {
+-                    init_ts_info(&temps_used, ts);
++                    init_ts_info(&ctx, ts);
+                 }
+             }
+         } else {
+             nb_oargs = def->nb_oargs;
+             nb_iargs = def->nb_iargs;
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+-                init_arg_info(&temps_used, op->args[i]);
++                init_arg_info(&ctx, op->args[i]);
+             }
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(rotr):
+             if (arg_is_const(op->args[1])
+                 && arg_info(op->args[1])->val == 0) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (partmask == 0) {
+             tcg_debug_assert(nb_oargs == 1);
+-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+             continue;
+         }
+         if (affected == 0) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(mulsh):
+             if (arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+         CASE_OP_32_64_VEC(xor):
+             if (args_are_copies(op->args[1], op->args[2])) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = arg_info(op->args[1])->val;
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_dup2_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
+                                            arg_info(op->args[2])->val));
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_extrh_i64_i32:
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           op->args[2]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 TCGArg v = arg_info(op->args[1])->val;
+                 if (v != 0) {
+                     tmp = do_constant_folding(opc, v, 0);
+-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 } else {
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                 }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = deposit64(arg_info(op->args[1])->val,
+                                 op->args[3], op->args[4],
+                                 arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = extract64(arg_info(op->args[1])->val,
+                                 op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = sextract64(arg_info(op->args[1])->val,
+                                  op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
+                                     ((uint32_t)v2 << (32 - shr)));
+                 }
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[3]);
+             if (tmp != 2) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                            op->args[1], op->args[2]);
+             if (tmp != 2) {
+                 if (tmp) {
+-                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                     op->opc = INDEX_op_br;
+                     op->args[0] = op->args[3];
+                 } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rl = op->args[0];
+                 rh = op->args[1];
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rl = op->args[0];
+                 rh = op->args[1];
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (tmp != 2) {
+                 if (tmp) {
+             do_brcond_true:
+-                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                     op->opc = INDEX_op_br;
+                     op->args[0] = op->args[5];
+                 } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_brcond_high:
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = INDEX_op_brcond_i32;
+                 op->args[0] = op->args[1];
+                 op->args[1] = op->args[3];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     goto do_default;
+                 }
+             do_brcond_low:
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = INDEX_op_brcond_i32;
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                             op->args[5]);
+             if (tmp != 2) {
+             do_setcond_const:
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+             } else if ((op->args[5] == TCG_COND_LT
+                         || op->args[5] == TCG_COND_GE)
+                        && arg_is_const(op->args[3])
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (!(tcg_call_flags(op)
+                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+                 for (i = 0; i < nb_globals; i++) {
+-                    if (test_bit(i, temps_used.l)) {
++                    if (test_bit(i, ctx.temps_used.l)) {
+                         reset_ts(&s->temps[i]);
+                     }
+                 }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                block, otherwise we only trash the output args.  "z_mask" is
+                the non-zero bits mask for the first output arg.  */
+             if (def->flags & TCG_OPF_BB_END) {
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+             } else {
+         do_reset_output:
+                 for (i = 0; i < nb_oargs; i++) {
+--
+.25.1

-New patch
+[PULL 08/56] tcg/optimize: Remove do_default label
+Break the final cleanup clause out of the main switch
+statement.  When fully folding an opcode to mov/movi,
+use "continue" to process the next opcode, else break
+to fall into the final cleanup.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
+ 1 file changed, 94 insertions(+), 96 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         switch (opc) {
+         CASE_OP_32_64_VEC(mov):
+             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+-            break;
++            continue;
+         case INDEX_op_dup_vec:
+             if (arg_is_const(op->args[1])) {
+                 tmp = arg_info(op->args[1])->val;
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_dup2_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
+                                            arg_info(op->args[2])->val));
+-                break;
++                continue;
+             } else if (args_are_copies(op->args[1], op->args[2])) {
+                 op->opc = INDEX_op_dup_vec;
+                 TCGOP_VECE(op) = MO_32;
+                 nb_iargs = 1;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(not):
+         CASE_OP_32_64(neg):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(bswap16):
+         CASE_OP_32_64(bswap32):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           op->args[2]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(add):
+         CASE_OP_32_64(sub):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           arg_info(op->args[2])->val);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else {
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                 }
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(deposit):
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                 op->args[3], op->args[4],
+                                 arg_info(op->args[2])->val);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(extract):
+             if (arg_is_const(op->args[1])) {
+                 tmp = extract64(arg_info(op->args[1])->val,
+                                 op->args[2], op->args[3]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(sextract):
+             if (arg_is_const(op->args[1])) {
+                 tmp = sextract64(arg_info(op->args[1])->val,
+                                  op->args[2], op->args[3]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(extract2):
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                     ((uint32_t)v2 << (32 - shr)));
+                 }
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(setcond):
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[3]);
+             if (tmp != 2) {
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(brcond):
+             tmp = do_constant_folding_cond(opc, op->args[0],
+                                            op->args[1], op->args[2]);
+-            if (tmp != 2) {
+-                if (tmp) {
+-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                    op->opc = INDEX_op_br;
+-                    op->args[0] = op->args[3];
+-                } else {
+-                    tcg_op_remove(s, op);
+-                }
++            switch (tmp) {
++            case 0:
++                tcg_op_remove(s, op);
++                continue;
++            case 1:
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
++                op->opc = opc = INDEX_op_br;
++                op->args[0] = op->args[3];
+                 break;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(movcond):
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[5]);
+             if (tmp != 2) {
+                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+-                break;
++                continue;
+             }
+             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+                 uint64_t tv = arg_info(op->args[3])->val;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (fv == 1 && tv == 0) {
+                     cond = tcg_invert_cond(cond);
+                 } else if (!(tv == 1 && fv == 0)) {
+-                    goto do_default;
++                    break;
+                 }
+                 op->args[3] = cond;
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                  : INDEX_op_setcond_i64);
+                 nb_iargs = 2;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_add2_i32:
+         case INDEX_op_sub2_i32:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rh = op->args[1];
+                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_mulu2_i32:
+             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rh = op->args[1];
+                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_brcond2_i32:
+             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                             op->args[4]);
+-            if (tmp != 2) {
+-                if (tmp) {
+-            do_brcond_true:
+-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                    op->opc = INDEX_op_br;
+-                    op->args[0] = op->args[5];
+-                } else {
++            if (tmp == 0) {
+             do_brcond_false:
+-                    tcg_op_remove(s, op);
+-                }
+-            } else if ((op->args[4] == TCG_COND_LT
+-                        || op->args[4] == TCG_COND_GE)
+-                       && arg_is_const(op->args[2])
+-                       && arg_info(op->args[2])->val == 0
+-                       && arg_is_const(op->args[3])
+-                       && arg_info(op->args[3])->val == 0) {
++                tcg_op_remove(s, op);
++                continue;
++            }
++            if (tmp == 1) {
++            do_brcond_true:
++                op->opc = opc = INDEX_op_br;
++                op->args[0] = op->args[5];
++                break;
++            }
++            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
++                 && arg_is_const(op->args[2])
++                 && arg_info(op->args[2])->val == 0
++                 && arg_is_const(op->args[3])
++                 && arg_info(op->args[3])->val == 0) {
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_brcond_high:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
++                op->opc = opc = INDEX_op_brcond_i32;
+                 op->args[0] = op->args[1];
+                 op->args[1] = op->args[3];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[4] == TCG_COND_EQ) {
++                break;
++            }
++            if (op->args[4] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (tmp == 0) {
+                     goto do_brcond_false;
+                 } else if (tmp != 1) {
+-                    goto do_default;
++                    break;
+                 }
+             do_brcond_low:
+                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[4] == TCG_COND_NE) {
++                break;
++            }
++            if (op->args[4] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else if (tmp == 1) {
+                     goto do_brcond_true;
+                 }
+-                goto do_default;
+-            } else {
+-                goto do_default;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (tmp != 2) {
+             do_setcond_const:
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-            } else if ((op->args[5] == TCG_COND_LT
+-                        || op->args[5] == TCG_COND_GE)
+-                       && arg_is_const(op->args[3])
+-                       && arg_info(op->args[3])->val == 0
+-                       && arg_is_const(op->args[4])
+-                       && arg_info(op->args[4])->val == 0) {
++                continue;
++            }
++            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
++                 && arg_is_const(op->args[3])
++                 && arg_info(op->args[3])->val == 0
++                 && arg_is_const(op->args[4])
++                 && arg_info(op->args[4])->val == 0) {
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_setcond_high:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[5] == TCG_COND_EQ) {
++                break;
++            }
++            if (op->args[5] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (tmp == 0) {
+                     goto do_setcond_high;
+                 } else if (tmp != 1) {
+-                    goto do_default;
++                    break;
+                 }
+             do_setcond_low:
+                 reset_temp(op->args[0]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->opc = INDEX_op_setcond_i32;
+                 op->args[2] = op->args[3];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[5] == TCG_COND_NE) {
++                break;
++            }
++            if (op->args[5] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else if (tmp == 1) {
+                     goto do_setcond_const;
+                 }
+-                goto do_default;
+-            } else {
+-                goto do_default;
+             }
+             break;
+-        case INDEX_op_call:
+-            if (!(tcg_call_flags(op)
++        default:
++            break;
++        }
++
++        /* Some of the folding above can change opc. */
++        opc = op->opc;
++        def = &tcg_op_defs[opc];
++        if (def->flags & TCG_OPF_BB_END) {
++            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
++        } else {
++            if (opc == INDEX_op_call &&
++                !(tcg_call_flags(op)
+                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+                 for (i = 0; i < nb_globals; i++) {
+                     if (test_bit(i, ctx.temps_used.l)) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     }
+                 }
+             }
+-            goto do_reset_output;
+-        default:
+-        do_default:
+-            /* Default case: we know nothing about operation (or were unable
+-               to compute the operation result) so no propagation is done.
+-               We trash everything if the operation is the end of a basic
+-               block, otherwise we only trash the output args.  "z_mask" is
+-               the non-zero bits mask for the first output arg.  */
+-            if (def->flags & TCG_OPF_BB_END) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-            } else {
+-        do_reset_output:
+-                for (i = 0; i < nb_oargs; i++) {
+-                    reset_temp(op->args[i]);
+-                    /* Save the corresponding known-zero bits mask for the
+-                       first output argument (only one supported so far). */
+-                    if (i == 0) {
+-                        arg_info(op->args[i])->z_mask = z_mask;
+-                    }
++            for (i = 0; i < nb_oargs; i++) {
++                reset_temp(op->args[i]);
++                /* Save the corresponding known-zero bits mask for the
++                   first output argument (only one supported so far). */
++                if (i == 0) {
++                    arg_info(op->args[i])->z_mask = z_mask;
+                 }
+             }
+-            break;
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+--
+.25.1

-New patch
+[PULL 09/56] tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
+Adjust the interface to take the OptContext parameter instead
 of TCGContext or both.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
  1 file changed, 34 insertions(+), 33 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
  } TempOptInfo;
  typedef struct OptContext {
 +    TCGContext *tcg;
      TCGTempSet temps_used;
  } OptContext;
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
      return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
  }
 -static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 +static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      TCGOpcode new_op;
      if (ts_are_copies(dst_ts, src_ts)) {
 -        tcg_op_remove(s, op);
 +        tcg_op_remove(ctx->tcg, op);
          return;
      }
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      }
  }
 -static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
 -                             TCGOp *op, TCGArg dst, uint64_t val)
 +static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
 +                             TCGArg dst, uint64_t val)
  {
      const TCGOpDef *def = &tcg_op_defs[op->opc];
      TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
      /* Convert movi to mov with constant temp. */
      tv = tcg_constant_internal(type, val);
      init_ts_info(ctx, tv);
 -    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 +    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
  static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
  {
      int nb_temps, nb_globals, i;
      TCGOp *op, *op_next, *prev_mb = NULL;
 -    OptContext ctx = {};
 +    OptContext ctx = { .tcg = s };
      /* Array VALS has an element for each temp.
         If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(rotr):
              if (arg_is_const(op->args[1])
                  && arg_info(op->args[1])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == -1) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (partmask == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
              continue;
          }
          if (affected == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(mulsh):
              if (arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(or):
          CASE_OP_32_64_VEC(and):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
          CASE_OP_32_64_VEC(xor):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             allocator where needed and possible.  Also detect copies. */
          switch (opc) {
          CASE_OP_32_64_VEC(mov):
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
 +                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                   deposit64(arg_info(op->args[1])->val, 32, 32,
                                             arg_info(op->args[2])->val));
                  continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGArg v = arg_info(op->args[1])->val;
                  if (v != 0) {
                      tmp = do_constant_folding(opc, v, 0);
 -                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  } else {
 -                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
 +                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                  }
                  continue;
              }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = deposit64(arg_info(op->args[1])->val,
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                      ((uint32_t)v2 << (32 - shr)));
                  }
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[5]);
              if (tmp != 2) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                              op->args[5]);
              if (tmp != 2) {
              do_setcond_const:
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 --
 .25.1

-New patch
+[PULL 10/56] tcg/optimize: Move prev_mb into OptContext
+This will expose the variable to subroutines that
+will be broken out of tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+ typedef struct OptContext {
+     TCGContext *tcg;
++    TCGOp *prev_mb;
+     TCGTempSet temps_used;
+ } OptContext;
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
+ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+-    TCGOp *op, *op_next, *prev_mb = NULL;
++    TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+     /* Array VALS has an element for each temp.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+-        if (prev_mb) {
++        if (ctx.prev_mb) {
+             switch (opc) {
+             case INDEX_op_mb:
+                 /* Merge two barriers of the same type into one,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                  * barrier.  This is stricter than specified but for
+                  * the purposes of TCG is better than not optimizing.
+                  */
+-                prev_mb->args[0] |= op->args[0];
++                ctx.prev_mb->args[0] |= op->args[0];
+                 tcg_op_remove(s, op);
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i64:
+             case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+-                prev_mb = NULL;
++                ctx.prev_mb = NULL;
+                 break;
+             }
+         } else if (opc == INDEX_op_mb) {
+-            prev_mb = op;
++            ctx.prev_mb = op;
+         }
+     }
+ }
+--
+.25.1

-[PATCH 16/27] tcg/tci: Remove tci_read_r32
+[PULL 11/56] tcg/optimize: Split out init_arguments
-Use explicit casts for ext32u opcodes, and allow truncation
+There was no real reason for calls to have separate code here.
-to happen for other users.
+Unify init for calls vs non-calls using the call path, which
 handles TCG_CALL_DUMMY_ARG.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 122 ++++++++++++++++++++++++------------------------------
+ tcg/optimize.c | 25 +++++++++++--------------
- 1 file changed, 54 insertions(+), 68 deletions(-)
+ 1 file changed, 11 insertions(+), 14 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
      }
  }
- #endif
+-static void init_arg_info(OptContext *ctx, TCGArg arg)
 -static uint32_t tci_read_reg32(const tcg_target_ulong *regs, TCGReg index)
 -{
--    return (uint32_t)tci_read_reg(regs, index);
+-    init_ts_info(ctx, arg_temp(arg));
 -}
 -
- #if TCG_TARGET_REG_BITS == 64
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
  static uint64_t tci_read_reg64(const tcg_target_ulong *regs, TCGReg index)
  {
-@@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
+     TCGTemp *i, *g, *l;
-     return value;
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
      return false;
  }
--/* Read indexed register (32 bit) from bytecode. */
++static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
--static uint32_t tci_read_r32(const tcg_target_ulong *regs,
++{
--                             const uint8_t **tb_ptr)
++    for (int i = 0; i < nb_args; i++) {
--{
++        TCGTemp *ts = arg_temp(op->args[i]);
--    uint32_t value = tci_read_reg32(regs, **tb_ptr);
++        if (ts) {
--    *tb_ptr += 1;
++            init_ts_info(ctx, ts);
--    return value;
++        }
--}
++    }
--
++}
- #if TCG_TARGET_REG_BITS == 32
++
- /* Read two indexed registers (2 * 32 bit) from bytecode. */
+ /* Propagate constants and copies, fold constant expressions. */
- static uint64_t tci_read_r64(const tcg_target_ulong *regs,
+ void tcg_optimize(TCGContext *s)
                               const uint8_t **tb_ptr)
  {
--    uint32_t low = tci_read_r32(regs, tb_ptr);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    return tci_uint64(tci_read_r32(regs, tb_ptr), low);
+         if (opc == INDEX_op_call) {
-+    uint32_t low = tci_read_r(regs, tb_ptr);
+             nb_oargs = TCGOP_CALLO(op);
-+    return tci_uint64(tci_read_r(regs, tb_ptr), low);
+             nb_iargs = TCGOP_CALLI(op);
- }
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
- #elif TCG_TARGET_REG_BITS == 64
+-                TCGTemp *ts = arg_temp(op->args[i]);
- /* Read indexed register (32 bit signed) from bytecode. */
+-                if (ts) {
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                    init_ts_info(&ctx, ts);
-             continue;
+-                }
-         case INDEX_op_setcond_i32:
+-            }
-             t0 = *tb_ptr++;
+         } else {
--            t1 = tci_read_r32(regs, &tb_ptr);
+             nb_oargs = def->nb_oargs;
--            t2 = tci_read_r32(regs, &tb_ptr);
+             nb_iargs = def->nb_iargs;
-+            t1 = tci_read_r(regs, &tb_ptr);
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-+            t2 = tci_read_r(regs, &tb_ptr);
+-                init_arg_info(&ctx, op->args[i]);
-             condition = *tb_ptr++;
+-            }
-             tci_write_reg(regs, t0, tci_compare32(t1, t2, condition));
+         }
-             break;
++        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
- #endif
+         /* Do copy propagation */
-         case INDEX_op_mov_i32:
+         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1);
              break;
          case INDEX_op_tci_movi_i32:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              break;
          case INDEX_op_st_i32:
          CASE_64(st32)
 -            t0 = tci_read_r32(regs, &tb_ptr);
 +            t0 = tci_read_r(regs, &tb_ptr);
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_s32(&tb_ptr);
              *(uint32_t *)(t1 + t2) = t0;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_add_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 + t2);
              break;
          case INDEX_op_sub_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 - t2);
              break;
          case INDEX_op_mul_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 * t2);
              break;
          case INDEX_op_div_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, (int32_t)t1 / (int32_t)t2);
              break;
          case INDEX_op_divu_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1 / t2);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (uint32_t)t1 / (uint32_t)t2);
              break;
          case INDEX_op_rem_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, (int32_t)t1 % (int32_t)t2);
              break;
          case INDEX_op_remu_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1 % t2);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (uint32_t)t1 % (uint32_t)t2);
              break;
          case INDEX_op_and_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 & t2);
              break;
          case INDEX_op_or_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 | t2);
              break;
          case INDEX_op_xor_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 ^ t2);
              break;
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_shl_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1 << (t2 & 31));
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (uint32_t)t1 << (t2 & 31));
              break;
          case INDEX_op_shr_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1 >> (t2 & 31));
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (uint32_t)t1 >> (t2 & 31));
              break;
          case INDEX_op_sar_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, ((int32_t)t1 >> (t2 & 31)));
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (int32_t)t1 >> (t2 & 31));
              break;
  #if TCG_TARGET_HAS_rot_i32
          case INDEX_op_rotl_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, rol32(t1, t2 & 31));
              break;
          case INDEX_op_rotr_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, ror32(t1, t2 & 31));
              break;
  #endif
  #if TCG_TARGET_HAS_deposit_i32
          case INDEX_op_deposit_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            t2 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
              tmp16 = *tb_ptr++;
              tmp8 = *tb_ptr++;
              tmp32 = (((1 << tmp8) - 1) << tmp16);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              break;
  #endif
          case INDEX_op_brcond_i32:
 -            t0 = tci_read_r32(regs, &tb_ptr);
 -            t1 = tci_read_r32(regs, &tb_ptr);
 +            t0 = tci_read_r(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
              condition = *tb_ptr++;
              label = tci_read_label(&tb_ptr);
              if (tci_compare32(t0, t1, condition)) {
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
          case INDEX_op_mulu2_i32:
              t0 = *tb_ptr++;
              t1 = *tb_ptr++;
 -            t2 = tci_read_r32(regs, &tb_ptr);
 -            tmp64 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg64(regs, t1, t0, t2 * tmp64);
 +            t2 = tci_read_r(regs, &tb_ptr);
 +            tmp64 = (uint32_t)tci_read_r(regs, &tb_ptr);
 +            tci_write_reg64(regs, t1, t0, (uint32_t)t2 * tmp64);
              break;
  #endif /* TCG_TARGET_REG_BITS == 32 */
  #if TCG_TARGET_HAS_ext8s_i32
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
  #if TCG_TARGET_HAS_bswap32_i32
          case INDEX_op_bswap32_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, bswap32(t1));
              break;
  #endif
  #if TCG_TARGET_HAS_not_i32
          case INDEX_op_not_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, ~t1);
              break;
  #endif
  #if TCG_TARGET_HAS_neg_i32
          case INDEX_op_neg_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, -t1);
              break;
  #endif
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
  #endif
          case INDEX_op_extu_i32_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (uint32_t)t1);
              break;
  #if TCG_TARGET_HAS_bswap16_i64
          case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
  #if TCG_TARGET_HAS_bswap32_i64
          case INDEX_op_bswap32_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32(regs, &tb_ptr);
 +            t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, bswap32(t1));
              break;
  #endif
 --
 .25.1

-New patch
+[PULL 12/56] tcg/optimize: Split out copy_propagate
+Continue splitting tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 22 ++++++++++++++--------
+file changed, 14 insertions(+), 8 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
+     }
+ }
++static void copy_propagate(OptContext *ctx, TCGOp *op,
++                           int nb_oargs, int nb_iargs)
++{
++    TCGContext *s = ctx->tcg;
++
++    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
++        TCGTemp *ts = arg_temp(op->args[i]);
++        if (ts && ts_is_copy(ts)) {
++            op->args[i] = temp_arg(find_better_copy(s, ts));
++        }
++    }
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             nb_iargs = def->nb_iargs;
+         }
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
+-
+-        /* Do copy propagation */
+-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+-            TCGTemp *ts = arg_temp(op->args[i]);
+-            if (ts && ts_is_copy(ts)) {
+-                op->args[i] = temp_arg(find_better_copy(s, ts));
+-            }
+-        }
++        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+         /* For commutative operations make constant second argument */
+         switch (opc) {
+--
+.25.1

-[PATCH 06/27] tcg: Manage splitwx in tc_ptr_to_region_tree by hand
+[PULL 13/56] tcg/optimize: Split out fold_call
-The use in tcg_tb_lookup is given a random pc that comes from the pc
+Calls are special in that they have a variable number
-of a signal handler.  Do not assert that the pointer is already within
+of arguments, and need to be able to clobber globals.
 the code gen buffer at all, much less the writable mirror of it.
-Fixes: db0c51a3803
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg.c | 20 ++++++++++++++++++--
+ tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
-file changed, 18 insertions(+), 2 deletions(-)
+file changed, 41 insertions(+), 22 deletions(-)
-diff --git a/tcg/tcg.c b/tcg/tcg.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
+--- a/tcg/optimize.c
-+++ b/tcg/tcg.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_region_trees_init(void)
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
      }
  }
--static struct tcg_region_tree *tc_ptr_to_region_tree(const void *cp)
++static bool fold_call(OptContext *ctx, TCGOp *op)
-+static struct tcg_region_tree *tc_ptr_to_region_tree(const void *p)
++{
- {
++    TCGContext *s = ctx->tcg;
--    void *p = tcg_splitwx_to_rw(cp);
++    int nb_oargs = TCGOP_CALLO(op);
-     size_t region_idx;
++    int nb_iargs = TCGOP_CALLI(op);
++    int flags, i;
-+    /*
++
-+     * Like tcg_splitwx_to_rw, with no assert.  The pc may come from
++    init_arguments(ctx, op, nb_oargs + nb_iargs);
-+     * a signal handler over which the caller has no control.
++    copy_propagate(ctx, op, nb_oargs, nb_iargs);
-+     */
++
-+    if (!in_code_gen_buffer(p)) {
++    /* If the function reads or writes globals, reset temp data. */
-+        p -= tcg_splitwx_diff;
++    flags = tcg_call_flags(op);
-+        if (!in_code_gen_buffer(p)) {
++    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
-+            return NULL;
++        int nb_globals = s->nb_globals;
 +
 +        for (i = 0; i < nb_globals; i++) {
 +            if (test_bit(i, ctx->temps_used.l)) {
 +                reset_ts(&ctx->tcg->temps[i]);
 +            }
 +        }
 +    }
 +
-     if (p < region.start_aligned) {
++    /* Reset temp data for outputs. */
-         region_idx = 0;
++    for (i = 0; i < nb_oargs; i++) {
-     } else {
++        reset_temp(op->args[i]);
@@ -XXX,XX +XXX,XX @@ void tcg_tb_insert(TranslationBlock *tb)
  {
      struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
 +    g_assert(rt != NULL);
      qemu_mutex_lock(&rt->lock);
      g_tree_insert(rt->tree, &tb->tc, tb);
      qemu_mutex_unlock(&rt->lock);
@@ -XXX,XX +XXX,XX @@ void tcg_tb_remove(TranslationBlock *tb)
  {
      struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
 +    g_assert(rt != NULL);
      qemu_mutex_lock(&rt->lock);
      g_tree_remove(rt->tree, &tb->tc);
      qemu_mutex_unlock(&rt->lock);
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr)
      TranslationBlock *tb;
      struct tb_tc s = { .ptr = (void *)tc_ptr };
 +    if (rt == NULL) {
 +        return NULL;
 +    }
 +
-     qemu_mutex_lock(&rt->lock);
++    /* Stop optimizing MB across calls. */
-     tb = g_tree_lookup(rt->tree, &s);
++    ctx->prev_mb = NULL;
-     qemu_mutex_unlock(&rt->lock);
++    return true;
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
 -    int nb_temps, nb_globals, i;
 +    int nb_temps, i;
      TCGOp *op, *op_next;
      OptContext ctx = { .tcg = s };
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         available through the doubly linked circular list. */
      nb_temps = s->nb_temps;
 -    nb_globals = s->nb_globals;
 -
      for (i = 0; i < nb_temps; ++i) {
          s->temps[i].state_ptr = NULL;
      }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          uint64_t z_mask, partmask, affected, tmp;
          int nb_oargs, nb_iargs;
          TCGOpcode opc = op->opc;
 -        const TCGOpDef *def = &tcg_op_defs[opc];
 +        const TCGOpDef *def;
 -        /* Count the arguments, and initialize the temps that are
 -           going to be used */
 +        /* Calls are special. */
          if (opc == INDEX_op_call) {
 -            nb_oargs = TCGOP_CALLO(op);
 -            nb_iargs = TCGOP_CALLI(op);
 -        } else {
 -            nb_oargs = def->nb_oargs;
 -            nb_iargs = def->nb_iargs;
 +            fold_call(&ctx, op);
 +            continue;
          }
 +
 +        def = &tcg_op_defs[opc];
 +        nb_oargs = def->nb_oargs;
 +        nb_iargs = def->nb_iargs;
          init_arguments(&ctx, op, nb_oargs + nb_iargs);
          copy_propagate(&ctx, op, nb_oargs, nb_iargs);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (def->flags & TCG_OPF_BB_END) {
              memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
          } else {
 -            if (opc == INDEX_op_call &&
 -                !(tcg_call_flags(op)
 -                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
 -                for (i = 0; i < nb_globals; i++) {
 -                    if (test_bit(i, ctx.temps_used.l)) {
 -                        reset_ts(&s->temps[i]);
 -                    }
 -                }
 -            }
 -
              for (i = 0; i < nb_oargs; i++) {
                  reset_temp(op->args[i]);
                  /* Save the corresponding known-zero bits mask for the
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              case INDEX_op_qemu_st_i32:
              case INDEX_op_qemu_st8_i32:
              case INDEX_op_qemu_st_i64:
 -            case INDEX_op_call:
                  /* Opcodes that touch guest memory stop the optimization.  */
                  ctx.prev_mb = NULL;
                  break;
 --
 .25.1

-New patch
+[PULL 14/56] tcg/optimize: Drop nb_oargs, nb_iargs locals
+Rather than try to keep these up-to-date across folding,
+re-read nb_oargs at the end, after re-reading the opcode.
+A couple of asserts need dropping, but that will take care
+of itself as we split the function further.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 14 ++++----------
+file changed, 4 insertions(+), 10 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
+         uint64_t z_mask, partmask, affected, tmp;
+-        int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         def = &tcg_op_defs[opc];
+-        nb_oargs = def->nb_oargs;
+-        nb_iargs = def->nb_iargs;
+-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
+-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
++        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
++        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
+         /* For commutative operations make constant second argument */
+         switch (opc) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(qemu_ld):
+             {
+-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
++                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
+                 MemOp mop = get_memop(oi);
+                 if (!(mop & MO_SIGN)) {
+                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         if (partmask == 0) {
+-            tcg_debug_assert(nb_oargs == 1);
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+             continue;
+         }
+         if (affected == 0) {
+-            tcg_debug_assert(nb_oargs == 1);
+             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+             continue;
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             } else if (args_are_copies(op->args[1], op->args[2])) {
+                 op->opc = INDEX_op_dup_vec;
+                 TCGOP_VECE(op) = MO_32;
+-                nb_iargs = 1;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
+                                  ? INDEX_op_setcond_i32
+                                  : INDEX_op_setcond_i64);
+-                nb_iargs = 2;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (def->flags & TCG_OPF_BB_END) {
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+         } else {
++            int nb_oargs = def->nb_oargs;
+             for (i = 0; i < nb_oargs; i++) {
+                 reset_temp(op->args[i]);
+                 /* Save the corresponding known-zero bits mask for the
+--
+.25.1

-[PATCH 14/27] tcg/tci: Remove tci_read_r16
+[PULL 15/56] tcg/optimize: Change fail return for do_constant_folding_cond*
-Use explicit casts for ext16u opcodes, and allow truncation
+Return -1 instead of 2 for failure, so that we can
-to happen with the store for st16 opcodes, and with the call
+use comparisons against 0 for all cases.
-for bswap16 opcodes.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 28 +++++++---------------------
+ tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
-file changed, 7 insertions(+), 21 deletions(-)
+file changed, 74 insertions(+), 71 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index)
+@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
      }
  }
- #endif
+-/* Return 2 if the condition can't be simplified, and the result
--static uint16_t tci_read_reg16(const tcg_target_ulong *regs, TCGReg index)
+-   of the condition (0 or 1) if it can */
--{
+-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
--    return (uint16_t)tci_read_reg(regs, index);
+-                                       TCGArg y, TCGCond c)
--}
++/*
--
++ * Return -1 if the condition can't be simplified,
- static uint32_t tci_read_reg32(const tcg_target_ulong *regs, TCGReg index)
++ * and the result of the condition (0 or 1) if it can.
 + */
 +static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +                                    TCGArg y, TCGCond c)
  {
-     return (uint32_t)tci_read_reg(regs, index);
+     uint64_t xv = arg_info(x)->val;
-@@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
+     uint64_t yv = arg_info(y)->val;
-     return value;
+@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
          case TCG_COND_GEU:
              return 1;
          default:
 -            return 2;
 +            return -1;
          }
      }
 -    return 2;
 +    return -1;
  }
--/* Read indexed register (16 bit) from bytecode. */
+-/* Return 2 if the condition can't be simplified, and the result
--static uint16_t tci_read_r16(const tcg_target_ulong *regs,
+-   of the condition (0 or 1) if it can */
--                             const uint8_t **tb_ptr)
+-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
--{
++/*
--    uint16_t value = tci_read_reg16(regs, **tb_ptr);
++ * Return -1 if the condition can't be simplified,
--    *tb_ptr += 1;
++ * and the result of the condition (0 or 1) if it can.
--    return value;
++ */
--}
++static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
--
+ {
- #if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64
+     TCGArg al = p1[0], ah = p1[1];
- /* Read indexed register (16 bit signed) from bytecode. */
+     TCGArg bl = p2[0], bh = p2[1];
- static int16_t tci_read_r16s(const tcg_target_ulong *regs,
+@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
-             *(uint8_t *)(t1 + t2) = t0;
+         return do_constant_folding_cond_eq(c);
-             break;
+     }
-         CASE_32_64(st16)
+-    return 2;
--            t0 = tci_read_r16(regs, &tb_ptr);
++    return -1;
-+            t0 = tci_read_r(regs, &tb_ptr);
+ }
-             t1 = tci_read_r(regs, &tb_ptr);
-             t2 = tci_read_s32(&tb_ptr);
+ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
-             *(uint16_t *)(t1 + t2) = t0;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+             break;
- #if TCG_TARGET_HAS_ext16u_i32
-         case INDEX_op_ext16u_i32:
+         CASE_OP_32_64(setcond):
-             t0 = *tb_ptr++;
+-            tmp = do_constant_folding_cond(opc, op->args[1],
--            t1 = tci_read_r16(regs, &tb_ptr);
+-                                           op->args[2], op->args[3]);
--            tci_write_reg(regs, t0, t1);
+-            if (tmp != 2) {
-+            t1 = tci_read_r(regs, &tb_ptr);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-+            tci_write_reg(regs, t0, (uint16_t)t1);
++            i = do_constant_folding_cond(opc, op->args[1],
-             break;
++                                         op->args[2], op->args[3]);
- #endif
++            if (i >= 0) {
- #if TCG_TARGET_HAS_bswap16_i32
++                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
-         case INDEX_op_bswap16_i32:
+                 continue;
-             t0 = *tb_ptr++;
+             }
--            t1 = tci_read_r16(regs, &tb_ptr);
+             break;
-+            t1 = tci_read_r(regs, &tb_ptr);
-             tci_write_reg(regs, t0, bswap16(t1));
+         CASE_OP_32_64(brcond):
-             break;
+-            tmp = do_constant_folding_cond(opc, op->args[0],
- #endif
+-                                           op->args[1], op->args[2]);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-            switch (tmp) {
- #if TCG_TARGET_HAS_ext16u_i64
+-            case 0:
-         case INDEX_op_ext16u_i64:
++            i = do_constant_folding_cond(opc, op->args[0],
-             t0 = *tb_ptr++;
++                                         op->args[1], op->args[2]);
--            t1 = tci_read_r16(regs, &tb_ptr);
++            if (i == 0) {
--            tci_write_reg(regs, t0, t1);
+                 tcg_op_remove(s, op);
-+            t1 = tci_read_r(regs, &tb_ptr);
+                 continue;
-+            tci_write_reg(regs, t0, (uint16_t)t1);
+-            case 1:
-             break;
++            } else if (i > 0) {
- #endif
+                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
- #if TCG_TARGET_HAS_ext32s_i64
+                 op->opc = opc = INDEX_op_br;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+                 op->args[0] = op->args[3];
- #if TCG_TARGET_HAS_bswap16_i64
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-         case INDEX_op_bswap16_i64:
+             break;
-             t0 = *tb_ptr++;
--            t1 = tci_read_r16(regs, &tb_ptr);
+         CASE_OP_32_64(movcond):
-+            t1 = tci_read_r(regs, &tb_ptr);
+-            tmp = do_constant_folding_cond(opc, op->args[1],
-             tci_write_reg(regs, t0, bswap16(t1));
+-                                           op->args[2], op->args[5]);
-             break;
+-            if (tmp != 2) {
- #endif
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
 +            i = do_constant_folding_cond(opc, op->args[1],
 +                                         op->args[2], op->args[5]);
 +            if (i >= 0) {
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          case INDEX_op_brcond2_i32:
 -            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
 -                                            op->args[4]);
 -            if (tmp == 0) {
 +            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
 +                                          op->args[4]);
 +            if (i == 0) {
              do_brcond_false:
                  tcg_op_remove(s, op);
                  continue;
              }
 -            if (tmp == 1) {
 +            if (i > 0) {
              do_brcond_true:
                  op->opc = opc = INDEX_op_br;
                  op->args[0] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[4] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[0], op->args[2],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[0], op->args[2],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_brcond_false;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_high;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_brcond_false;
 -                } else if (tmp != 1) {
 +                } else if (i < 0) {
                      break;
                  }
              do_brcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[4] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[0], op->args[2],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[0], op->args[2],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
              }
              break;
          case INDEX_op_setcond2_i32:
 -            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
 -                                            op->args[5]);
 -            if (tmp != 2) {
 +            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
 +                                          op->args[5]);
 +            if (i >= 0) {
              do_setcond_const:
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_const;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_high;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp != 1) {
 +                } else if (i < 0) {
                      break;
                  }
              do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
              }
 --
 .25.1

-[PATCH 15/27] tcg/tci: Remove tci_read_r16s
+[PULL 16/56] tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
-Use explicit casts for ext16s opcodes.
+This will allow callers to tail call to these functions
 and return true indicating processing complete.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 26 ++++----------------------
+ tcg/optimize.c | 9 +++++----
-file changed, 4 insertions(+), 22 deletions(-)
+file changed, 5 insertions(+), 4 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index)
+@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
-     return regs[index];
+     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
  }
--#if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64
+-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
--static int16_t tci_read_reg16s(const tcg_target_ulong *regs, TCGReg index)
++static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 -{
 -    return (int16_t)tci_read_reg(regs, index);
 -}
 -#endif
 -
  #if TCG_TARGET_REG_BITS == 64
  static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index)
  {
-@@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
+     TCGTemp *dst_ts = arg_temp(dst);
-     return value;
+     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      if (ts_are_copies(dst_ts, src_ts)) {
          tcg_op_remove(ctx->tcg, op);
 -        return;
 +        return true;
      }
      reset_ts(dst_ts);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
          di->is_const = si->is_const;
          di->val = si->val;
      }
 +    return true;
  }
--#if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64
+-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
--/* Read indexed register (16 bit signed) from bytecode. */
++static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
--static int16_t tci_read_r16s(const tcg_target_ulong *regs,
+                              TCGArg dst, uint64_t val)
--                             const uint8_t **tb_ptr)
+ {
--{
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
--    int16_t value = tci_read_reg16s(regs, **tb_ptr);
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
--    *tb_ptr += 1;
+     /* Convert movi to mov with constant temp. */
--    return value;
+     tv = tcg_constant_internal(type, val);
--}
+     init_ts_info(ctx, tv);
--#endif
+-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
--
++    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
- /* Read indexed register (32 bit) from bytecode. */
+ }
- static uint32_t tci_read_r32(const tcg_target_ulong *regs,
-                              const uint8_t **tb_ptr)
+ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
  #if TCG_TARGET_HAS_ext16s_i32
          case INDEX_op_ext16s_i32:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r16s(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (int16_t)t1);
              break;
  #endif
  #if TCG_TARGET_HAS_ext8u_i32
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
  #if TCG_TARGET_HAS_ext16s_i64
          case INDEX_op_ext16s_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r16s(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (int16_t)t1);
              break;
  #endif
  #if TCG_TARGET_HAS_ext16u_i64
 --
 .25.1

-New patch
+[PULL 17/56] tcg/optimize: Split out finish_folding
+Copy z_mask into OptContext, for writeback to the
+first output within the new function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
+file changed, 33 insertions(+), 16 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
+     TCGContext *tcg;
+     TCGOp *prev_mb;
+     TCGTempSet temps_used;
++
++    /* In flight values from optimization. */
++    uint64_t z_mask;
+ } OptContext;
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
+     }
+ }
++static void finish_folding(OptContext *ctx, TCGOp *op)
++{
++    const TCGOpDef *def = &tcg_op_defs[op->opc];
++    int i, nb_oargs;
++
++    /*
++     * For an opcode that ends a BB, reset all temp data.
++     * We do no cross-BB optimization.
++     */
++    if (def->flags & TCG_OPF_BB_END) {
++        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
++        ctx->prev_mb = NULL;
++        return;
++    }
++
++    nb_oargs = def->nb_oargs;
++    for (i = 0; i < nb_oargs; i++) {
++        reset_temp(op->args[i]);
++        /*
++         * Save the corresponding known-zero bits mask for the
++         * first output argument (only one supported so far).
++         */
++        if (i == 0) {
++            arg_info(op->args[i])->z_mask = ctx->z_mask;
++        }
++    }
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             partmask &= 0xffffffffu;
+             affected &= 0xffffffffu;
+         }
++        ctx.z_mask = z_mask;
+         if (partmask == 0) {
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Some of the folding above can change opc. */
+-        opc = op->opc;
+-        def = &tcg_op_defs[opc];
+-        if (def->flags & TCG_OPF_BB_END) {
+-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-        } else {
+-            int nb_oargs = def->nb_oargs;
+-            for (i = 0; i < nb_oargs; i++) {
+-                reset_temp(op->args[i]);
+-                /* Save the corresponding known-zero bits mask for the
+-                   first output argument (only one supported so far). */
+-                if (i == 0) {
+-                    arg_info(op->args[i])->z_mask = z_mask;
+-                }
+-            }
+-        }
++        finish_folding(&ctx, op);
+         /* Eliminate duplicate and redundant fence instructions.  */
+         if (ctx.prev_mb) {
+--
+.25.1

-New patch
+[PULL 18/56] tcg/optimize: Use a boolean to avoid a mass of continues
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 9 ++++++---
+file changed, 6 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def;
++        bool done = false;
+         /* Calls are special. */
+         if (opc == INDEX_op_call) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+            allocator where needed and possible.  Also detect copies. */
+         switch (opc) {
+         CASE_OP_32_64_VEC(mov):
+-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-            continue;
++            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
++            break;
+         case INDEX_op_dup_vec:
+             if (arg_is_const(op->args[1])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        finish_folding(&ctx, op);
++        if (!done) {
++            finish_folding(&ctx, op);
++        }
+         /* Eliminate duplicate and redundant fence instructions.  */
+         if (ctx.prev_mb) {
+--
+.25.1

-[PATCH 19/27] tcg/tci: Merge basic arithmetic operations
+[PULL 19/56] tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
-This includes add, sub, mul, and, or, xor.
+This puts the separate mb optimization into the same framework
 as the others.  While fold_qemu_{ld,st} are currently identical,
 that won't last as more code gets moved.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 83 +++++++++++++++++--------------------------------------
+ tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
-file changed, 25 insertions(+), 58 deletions(-)
+file changed, 51 insertions(+), 38 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
-             *(uint32_t *)(t1 + t2) = t0;
+     return true;
  }
 +static bool fold_mb(OptContext *ctx, TCGOp *op)
 +{
 +    /* Eliminate duplicate and redundant fence instructions.  */
 +    if (ctx->prev_mb) {
 +        /*
 +         * Merge two barriers of the same type into one,
 +         * or a weaker barrier into a stronger one,
 +         * or two weaker barriers into a stronger one.
 +         *   mb X; mb Y => mb X|Y
 +         *   mb; strl => mb; st
 +         *   ldaq; mb => ld; mb
 +         *   ldaq; strl => ld; mb; st
 +         * Other combinations are also merged into a strong
 +         * barrier.  This is stricter than specified but for
 +         * the purposes of TCG is better than not optimizing.
 +         */
 +        ctx->prev_mb->args[0] |= op->args[0];
 +        tcg_op_remove(ctx->tcg, op);
 +    } else {
 +        ctx->prev_mb = op;
 +    }
 +    return true;
 +}
 +
 +static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 +{
 +    /* Opcodes that touch guest memory stop the mb optimization.  */
 +    ctx->prev_mb = NULL;
 +    return false;
 +}
 +
 +static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
 +{
 +    /* Opcodes that touch guest memory stop the mb optimization.  */
 +    ctx->prev_mb = NULL;
 +    return false;
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
--            /* Arithmetic operations (32 bit). */
++        case INDEX_op_mb:
-+            /* Arithmetic operations (mixed 32/64 bit). */
++            done = fold_mb(&ctx, op);
 -        case INDEX_op_add_i32:
 +        CASE_32_64(add)
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 + t2);
              break;
 -        case INDEX_op_sub_i32:
 +        CASE_32_64(sub)
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 - t2);
              break;
 -        case INDEX_op_mul_i32:
 +        CASE_32_64(mul)
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, t1 * t2);
              break;
 +        CASE_32_64(and)
 +            t0 = *tb_ptr++;
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            t2 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, t1 & t2);
 +            break;
-+        CASE_32_64(or)
++        case INDEX_op_qemu_ld_i32:
-+            t0 = *tb_ptr++;
++        case INDEX_op_qemu_ld_i64:
-+            t1 = tci_read_r(regs, &tb_ptr);
++            done = fold_qemu_ld(&ctx, op);
 +            t2 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, t1 | t2);
 +            break;
-+        CASE_32_64(xor)
++        case INDEX_op_qemu_st_i32:
-+            t0 = *tb_ptr++;
++        case INDEX_op_qemu_st8_i32:
-+            t1 = tci_read_r(regs, &tb_ptr);
++        case INDEX_op_qemu_st_i64:
-+            t2 = tci_read_r(regs, &tb_ptr);
++            done = fold_qemu_st(&ctx, op);
 +            tci_write_reg(regs, t0, t1 ^ t2);
 +            break;
 +
-+            /* Arithmetic operations (32 bit). */
+         default:
 +
          case INDEX_op_div_i32:
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, (uint32_t)t1 % (uint32_t)t2);
              break;
--        case INDEX_op_and_i32:
+         }
--            t0 = *tb_ptr++;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--            t1 = tci_read_r(regs, &tb_ptr);
+         if (!done) {
--            t2 = tci_read_r(regs, &tb_ptr);
+             finish_folding(&ctx, op);
--            tci_write_reg(regs, t0, t1 & t2);
+         }
--            break;
+-
--        case INDEX_op_or_i32:
+-        /* Eliminate duplicate and redundant fence instructions.  */
--            t0 = *tb_ptr++;
+-        if (ctx.prev_mb) {
--            t1 = tci_read_r(regs, &tb_ptr);
+-            switch (opc) {
--            t2 = tci_read_r(regs, &tb_ptr);
+-            case INDEX_op_mb:
--            tci_write_reg(regs, t0, t1 | t2);
+-                /* Merge two barriers of the same type into one,
--            break;
+-                 * or a weaker barrier into a stronger one,
--        case INDEX_op_xor_i32:
+-                 * or two weaker barriers into a stronger one.
--            t0 = *tb_ptr++;
+-                 *   mb X; mb Y => mb X|Y
--            t1 = tci_read_r(regs, &tb_ptr);
+-                 *   mb; strl => mb; st
--            t2 = tci_read_r(regs, &tb_ptr);
+-                 *   ldaq; mb => ld; mb
--            tci_write_reg(regs, t0, t1 ^ t2);
+-                 *   ldaq; strl => ld; mb; st
--            break;
+-                 * Other combinations are also merged into a strong
+-                 * barrier.  This is stricter than specified but for
-             /* Shift/rotate operations (32 bit). */
+-                 * the purposes of TCG is better than not optimizing.
+-                 */
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+-                ctx.prev_mb->args[0] |= op->args[0];
+-                tcg_op_remove(s, op);
-             /* Arithmetic operations (64 bit). */
+-                break;
+-
--        case INDEX_op_add_i64:
+-            default:
--            t0 = *tb_ptr++;
+-                /* Opcodes that end the block stop the optimization.  */
--            t1 = tci_read_r(regs, &tb_ptr);
+-                if ((def->flags & TCG_OPF_BB_END) == 0) {
--            t2 = tci_read_r(regs, &tb_ptr);
+-                    break;
--            tci_write_reg(regs, t0, t1 + t2);
+-                }
--            break;
+-                /* fallthru */
--        case INDEX_op_sub_i64:
+-            case INDEX_op_qemu_ld_i32:
--            t0 = *tb_ptr++;
+-            case INDEX_op_qemu_ld_i64:
--            t1 = tci_read_r(regs, &tb_ptr);
+-            case INDEX_op_qemu_st_i32:
--            t2 = tci_read_r(regs, &tb_ptr);
+-            case INDEX_op_qemu_st8_i32:
--            tci_write_reg(regs, t0, t1 - t2);
+-            case INDEX_op_qemu_st_i64:
--            break;
+-                /* Opcodes that touch guest memory stop the optimization.  */
--        case INDEX_op_mul_i64:
+-                ctx.prev_mb = NULL;
--            t0 = *tb_ptr++;
+-                break;
--            t1 = tci_read_r(regs, &tb_ptr);
+-            }
--            t2 = tci_read_r(regs, &tb_ptr);
+-        } else if (opc == INDEX_op_mb) {
--            tci_write_reg(regs, t0, t1 * t2);
+-            ctx.prev_mb = op;
--            break;
+-        }
-         case INDEX_op_div_i64:
+     }
-             t0 = *tb_ptr++;
+ }
              t1 = tci_read_r(regs, &tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t2 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, (uint64_t)t1 % (uint64_t)t2);
              break;
 -        case INDEX_op_and_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            t2 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1 & t2);
 -            break;
 -        case INDEX_op_or_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            t2 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1 | t2);
 -            break;
 -        case INDEX_op_xor_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            t2 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1 ^ t2);
 -            break;
              /* Shift/rotate operations (64 bit). */
 --
 .25.1

-[PATCH 25/27] accel/tcg: drop the use of CF_HASH_MASK and rename params
+[PULL 20/56] tcg/optimize: Split out fold_const{1,2}
-From: Alex Bennée <alex.bennee@linaro.org>
+Split out a whole bunch of placeholder functions, which are
+currently identical.  That won't last as more code gets moved.
-We don't really deal in cf_mask most of the time. The one time it's
-relevant is when we want to remove an invalidated TB from the QHT
+Use CASE_32_64_VEC for some logical operators that previously
-lookup. Everywhere else we should be looking up things without
+missed the addition of vectors.
-CF_INVALID set.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Message-Id: <20210224165811.11567-4-alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h   |  4 +---
+ tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
- include/exec/tb-lookup.h  |  9 ++++++---
+file changed, 219 insertions(+), 52 deletions(-)
- accel/tcg/cpu-exec.c      | 16 ++++++++--------
- accel/tcg/tcg-runtime.c   |  2 +-
+diff --git a/tcg/optimize.c b/tcg/optimize.c
  accel/tcg/translate-all.c |  8 +++++---
 files changed, 21 insertions(+), 18 deletions(-)
 diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
+@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
- #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */
+     }
- #define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */
+ }
- #define CF_CLUSTER_SHIFT 24
--/* cflags' mask for hashing/comparison, basically ignore CF_INVALID */
++/*
--#define CF_HASH_MASK   (~CF_INVALID)
++ * The fold_* functions return true when processing is complete,
++ * usually by folding the operation to a constant or to a copy,
-     /* Per-vCPU dynamic tracing state used to generate this TB */
++ * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
-     uint32_t trace_vcpu_dstate;
++ * like collect information about the value produced, for use in
-@@ -XXX,XX +XXX,XX @@ void tb_flush(CPUState *cpu);
++ * optimizing a subsequent operation.
- void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
++ *
- TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
++ * These first fold_* functions are all helpers, used by other
-                                    target_ulong cs_base, uint32_t flags,
++ * folders for more specific operations.
--                                   uint32_t cf_mask);
++ */
-+                                   uint32_t cflags);
++
- void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr);
++static bool fold_const1(OptContext *ctx, TCGOp *op)
++{
- /* GETPC is the true target of the return instruction that we'll execute.  */
++    if (arg_is_const(op->args[1])) {
-diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h
++        uint64_t t;
-index XXXXXXX..XXXXXXX 100644
++
---- a/include/exec/tb-lookup.h
++        t = arg_info(op->args[1])->val;
-+++ b/include/exec/tb-lookup.h
++        t = do_constant_folding(op->opc, t, 0);
-@@ -XXX,XX +XXX,XX @@
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
- /* Might cause an exception, so have a longjmp destination ready */
++    }
- static inline TranslationBlock * tb_lookup(CPUState *cpu,
++    return false;
-                                            target_ulong pc, target_ulong cs_base,
++}
--                                           uint32_t flags, uint32_t cf_mask)
++
-+                                           uint32_t flags, uint32_t cflags)
++static bool fold_const2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t t1 = arg_info(op->args[1])->val;
 +        uint64_t t2 = arg_info(op->args[2])->val;
 +
 +        t1 = do_constant_folding(op->opc, t1, t2);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
 +    }
 +    return false;
 +}
 +
 +/*
 + * These outermost fold_<op> functions are sorted alphabetically.
 + */
 +
 +static bool fold_add(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_and(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_andc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
-     TranslationBlock *tb;
+     TCGContext *s = ctx->tcg;
-     uint32_t hash;
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
+     return true;
-+    /* we should never be trying to look up an INVALID tb */
+ }
-+    tcg_debug_assert(!(cflags & CF_INVALID));
-+
++static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-     hash = tb_jmp_cache_hash_func(pc);
++{
-     tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
++    return fold_const1(ctx, op);
++}
-@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock * tb_lookup(CPUState *cpu,
++
-                tb->cs_base == cs_base &&
++static bool fold_divide(OptContext *ctx, TCGOp *op)
-                tb->flags == flags &&
++{
-                tb->trace_vcpu_dstate == *cpu->trace_dstate &&
++    return fold_const2(ctx, op);
--               (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) {
++}
-+               tb_cflags(tb) == cflags)) {
++
-         return tb;
++static bool fold_eqv(OptContext *ctx, TCGOp *op)
-     }
++{
--    tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
++    return fold_const2(ctx, op);
-+    tb = tb_htable_lookup(cpu, pc, cs_base, flags, cflags);
++}
-     if (tb == NULL) {
++
-         return NULL;
++static bool fold_exts(OptContext *ctx, TCGOp *op)
-     }
++{
-diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
++    return fold_const1(ctx, op);
-index XXXXXXX..XXXXXXX 100644
++}
---- a/accel/tcg/cpu-exec.c
++
-+++ b/accel/tcg/cpu-exec.c
++static bool fold_extu(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ struct tb_desc {
++{
-     CPUArchState *env;
++    return fold_const1(ctx, op);
-     tb_page_addr_t phys_page1;
++}
-     uint32_t flags;
++
--    uint32_t cf_mask;
+ static bool fold_mb(OptContext *ctx, TCGOp *op)
 +    uint32_t cflags;
      uint32_t trace_vcpu_dstate;
  };
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
          tb->cs_base == desc->cs_base &&
          tb->flags == desc->flags &&
          tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
 -        (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == desc->cf_mask) {
 +        tb_cflags(tb) == desc->cflags) {
          /* check next page if needed */
          if (tb->page_addr[1] == -1) {
              return true;
@@ -XXX,XX +XXX,XX @@ static bool tb_lookup_cmp(const void *p, const void *d)
  TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
                                     target_ulong cs_base, uint32_t flags,
 -                                   uint32_t cf_mask)
 +                                   uint32_t cflags)
  {
-     tb_page_addr_t phys_pc;
+     /* Eliminate duplicate and redundant fence instructions.  */
-     struct tb_desc desc;
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
+     return true;
      desc.env = (CPUArchState *)cpu->env_ptr;
      desc.cs_base = cs_base;
      desc.flags = flags;
 -    desc.cf_mask = cf_mask;
 +    desc.cflags = cflags;
      desc.trace_vcpu_dstate = *cpu->trace_dstate;
      desc.pc = pc;
      phys_pc = get_page_addr_code(desc.env, pc);
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
          return NULL;
      }
      desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
 -    h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);
 +    h = tb_hash_func(phys_pc, pc, flags, cflags, *cpu->trace_dstate);
      return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
  }
-@@ -XXX,XX +XXX,XX @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
++static bool fold_mul(OptContext *ctx, TCGOp *op)
++{
- static inline TranslationBlock *tb_find(CPUState *cpu,
++    return fold_const2(ctx, op);
-                                         TranslationBlock *last_tb,
++}
--                                        int tb_exit, uint32_t cf_mask)
++
-+                                        int tb_exit, uint32_t cflags)
++static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_nand(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_neg(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_nor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_not(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_or(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_orc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
-     CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+     /* Opcodes that touch guest memory stop the mb optimization.  */
-     TranslationBlock *tb;
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_find(CPUState *cpu,
+     return false;
+ }
-     cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
++static bool fold_remainder(OptContext *ctx, TCGOp *op)
--    tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask);
++{
-+    tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
++    return fold_const2(ctx, op);
-     if (tb == NULL) {
++}
-         mmap_lock();
++
--        tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
++static bool fold_shift(OptContext *ctx, TCGOp *op)
-+        tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
++{
-         mmap_unlock();
++    return fold_const2(ctx, op);
-         /* We add the TB in the virtual pc hash table for the fast lookup */
++}
-         qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
++
-diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
++static bool fold_sub(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
++{
---- a/accel/tcg/tcg-runtime.c
++    return fold_const2(ctx, op);
-+++ b/accel/tcg/tcg-runtime.c
++}
-@@ -XXX,XX +XXX,XX @@
++
- #include "exec/helper-proto.h"
++static bool fold_xor(OptContext *ctx, TCGOp *op)
- #include "exec/cpu_ldst.h"
++{
- #include "exec/exec-all.h"
++    return fold_const2(ctx, op);
--#include "exec/tb-lookup.h"
++}
- #include "disas/disas.h"
++
- #include "exec/log.h"
+ /* Propagate constants and copies, fold constant expressions. */
- #include "tcg/tcg.h"
+ void tcg_optimize(TCGContext *s)
-+#include "exec/tb-lookup.h"
+ {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- /* 32-bit helpers */
+             }
+             break;
-diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
-index XXXXXXX..XXXXXXX 100644
+-        CASE_OP_32_64(not):
---- a/accel/tcg/translate-all.c
+-        CASE_OP_32_64(neg):
-+++ b/accel/tcg/translate-all.c
+-        CASE_OP_32_64(ext8s):
-@@ -XXX,XX +XXX,XX @@ static bool tb_cmp(const void *ap, const void *bp)
+-        CASE_OP_32_64(ext8u):
-     return a->pc == b->pc &&
+-        CASE_OP_32_64(ext16s):
-         a->cs_base == b->cs_base &&
+-        CASE_OP_32_64(ext16u):
-         a->flags == b->flags &&
+-        CASE_OP_32_64(ctpop):
--        (tb_cflags(a) & CF_HASH_MASK) == (tb_cflags(b) & CF_HASH_MASK) &&
+-        case INDEX_op_ext32s_i64:
-+        (tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
+-        case INDEX_op_ext32u_i64:
-         a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
+-        case INDEX_op_ext_i32_i64:
-         a->page_addr[0] == b->page_addr[0] &&
+-        case INDEX_op_extu_i32_i64:
-         a->page_addr[1] == b->page_addr[1];
+-        case INDEX_op_extrl_i64_i32:
-@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
+-        case INDEX_op_extrh_i64_i32:
-     PageDesc *p;
+-            if (arg_is_const(op->args[1])) {
-     uint32_t h;
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-     tb_page_addr_t phys_pc;
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-+    uint32_t orig_cflags = tb_cflags(tb);
+-                continue;
+-            }
-     assert_memory_lock();
+-            break;
+-
-@@ -XXX,XX +XXX,XX @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
+         CASE_OP_32_64(bswap16):
+         CASE_OP_32_64(bswap32):
-     /* remove the TB from the hash list */
+         case INDEX_op_bswap64_i64:
-     phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--    h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb_cflags(tb) & CF_HASH_MASK,
+             }
-+    h = tb_hash_func(phys_pc, tb->pc, tb->flags, orig_cflags,
+             break;
-                      tb->trace_vcpu_dstate);
-     if (!qht_remove(&tb_ctx.htable, tb, h)) {
+-        CASE_OP_32_64(add):
-         return;
+-        CASE_OP_32_64(sub):
-@@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
+-        CASE_OP_32_64(mul):
-     uint32_t h;
+-        CASE_OP_32_64(or):
+-        CASE_OP_32_64(and):
-     assert_memory_lock();
+-        CASE_OP_32_64(xor):
-+    tcg_debug_assert(!(tb->cflags & CF_INVALID));
+-        CASE_OP_32_64(shl):
+-        CASE_OP_32_64(shr):
-     /*
+-        CASE_OP_32_64(sar):
-      * Add the TB to the page list, acquiring first the pages's locks.
+-        CASE_OP_32_64(rotl):
-@@ -XXX,XX +XXX,XX @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
+-        CASE_OP_32_64(rotr):
-     }
+-        CASE_OP_32_64(andc):
+-        CASE_OP_32_64(orc):
-     /* add in the hash table */
+-        CASE_OP_32_64(eqv):
--    h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK,
+-        CASE_OP_32_64(nand):
-+    h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags,
+-        CASE_OP_32_64(nor):
-                      tb->trace_vcpu_dstate);
+-        CASE_OP_32_64(muluh):
-     qht_insert(&tb_ctx.htable, tb, h, &existing_tb);
+-        CASE_OP_32_64(mulsh):
 -        CASE_OP_32_64(div):
 -        CASE_OP_32_64(divu):
 -        CASE_OP_32_64(rem):
 -        CASE_OP_32_64(remu):
 -            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
 -                                          arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          CASE_OP_32_64(clz):
          CASE_OP_32_64(ctz):
              if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 +        default:
 +            break;
 +
 +        /* ---------------------------------------------------------- */
 +        /* Sorted alphabetically by opcode as much as possible. */
 +
 +        CASE_OP_32_64_VEC(add):
 +            done = fold_add(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(and):
 +            done = fold_and(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(andc):
 +            done = fold_andc(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ctpop):
 +            done = fold_ctpop(&ctx, op);
 +            break;
 +        CASE_OP_32_64(div):
 +        CASE_OP_32_64(divu):
 +            done = fold_divide(&ctx, op);
 +            break;
 +        CASE_OP_32_64(eqv):
 +            done = fold_eqv(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ext8s):
 +        CASE_OP_32_64(ext16s):
 +        case INDEX_op_ext32s_i64:
 +        case INDEX_op_ext_i32_i64:
 +            done = fold_exts(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ext8u):
 +        CASE_OP_32_64(ext16u):
 +        case INDEX_op_ext32u_i64:
 +        case INDEX_op_extu_i32_i64:
 +        case INDEX_op_extrl_i64_i32:
 +        case INDEX_op_extrh_i64_i32:
 +            done = fold_extu(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 +        CASE_OP_32_64(mul):
 +            done = fold_mul(&ctx, op);
 +            break;
 +        CASE_OP_32_64(mulsh):
 +        CASE_OP_32_64(muluh):
 +            done = fold_mul_highpart(&ctx, op);
 +            break;
 +        CASE_OP_32_64(nand):
 +            done = fold_nand(&ctx, op);
 +            break;
 +        CASE_OP_32_64(neg):
 +            done = fold_neg(&ctx, op);
 +            break;
 +        CASE_OP_32_64(nor):
 +            done = fold_nor(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(not):
 +            done = fold_not(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(or):
 +            done = fold_or(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(orc):
 +            done = fold_orc(&ctx, op);
 +            break;
          case INDEX_op_qemu_ld_i32:
          case INDEX_op_qemu_ld_i64:
              done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_qemu_st_i64:
              done = fold_qemu_st(&ctx, op);
              break;
 -
 -        default:
 +        CASE_OP_32_64(rem):
 +        CASE_OP_32_64(remu):
 +            done = fold_remainder(&ctx, op);
 +            break;
 +        CASE_OP_32_64(rotl):
 +        CASE_OP_32_64(rotr):
 +        CASE_OP_32_64(sar):
 +        CASE_OP_32_64(shl):
 +        CASE_OP_32_64(shr):
 +            done = fold_shift(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(sub):
 +            done = fold_sub(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(xor):
 +            done = fold_xor(&ctx, op);
              break;
          }
 --
 .25.1

-New patch
+[PULL 21/56] tcg/optimize: Split out fold_setcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
+file changed, 72 insertions(+), 73 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_setcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
++            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
++            goto do_setcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            goto do_setcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
++                                     op->args[4], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            op->args[2] = op->args[3];
++            op->args[3] = cond;
++            op->opc = INDEX_op_setcond_i32;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_setcond_high:
++        op->args[1] = op->args[2];
++        op->args[2] = op->args[4];
++        op->args[3] = cond;
++        op->opc = INDEX_op_setcond_i32;
++        break;
++    }
++    return false;
++
++ do_setcond_const:
++    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++}
++
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_setcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+-                                          op->args[5]);
+-            if (i >= 0) {
+-            do_setcond_const:
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0
+-                 && arg_is_const(op->args[4])
+-                 && arg_info(op->args[4])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_setcond_high:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_const;
+-                } else if (i > 0) {
+-                    goto do_setcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_setcond_low:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[2] = op->args[3];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_low;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        case INDEX_op_setcond2_i32:
++            done = fold_setcond2(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 22/56] tcg/optimize: Split out fold_brcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
+file changed, 81 insertions(+), 78 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[4];
++    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
++    TCGArg label = op->args[5];
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_brcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
++            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
++            goto do_brcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
++                                     op->args[2], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            goto do_brcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            op->opc = INDEX_op_brcond_i32;
++            op->args[1] = op->args[2];
++            op->args[2] = cond;
++            op->args[3] = label;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_brcond_high:
++        op->opc = INDEX_op_brcond_i32;
++        op->args[0] = op->args[1];
++        op->args[1] = op->args[3];
++        op->args[2] = cond;
++        op->args[3] = label;
++        break;
++
++    do_brcond_const:
++        if (i == 0) {
++            tcg_op_remove(ctx->tcg, op);
++            return true;
++        }
++        op->opc = INDEX_op_br;
++        op->args[0] = label;
++        break;
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_brcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+-                                          op->args[4]);
+-            if (i == 0) {
+-            do_brcond_false:
+-                tcg_op_remove(s, op);
+-                continue;
+-            }
+-            if (i > 0) {
+-            do_brcond_true:
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[5];
+-                break;
+-            }
+-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+-                 && arg_is_const(op->args[2])
+-                 && arg_info(op->args[2])->val == 0
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_brcond_high:
+-                op->opc = opc = INDEX_op_brcond_i32;
+-                op->args[0] = op->args[1];
+-                op->args[1] = op->args[3];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i > 0) {
+-                    goto do_brcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_brcond_low:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_high;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_low;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        case INDEX_op_brcond2_i32:
++            done = fold_brcond2(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 23/56] tcg/optimize: Split out fold_brcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 33 +++++++++++++++++++--------------
+file changed, 19 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[2];
++    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
++
++    if (i == 0) {
++        tcg_op_remove(ctx->tcg, op);
++        return true;
++    }
++    if (i > 0) {
++        op->opc = INDEX_op_br;
++        op->args[0] = op->args[3];
++    }
++    return false;
++}
++
+ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(brcond):
+-            i = do_constant_folding_cond(opc, op->args[0],
+-                                         op->args[1], op->args[2]);
+-            if (i == 0) {
+-                tcg_op_remove(s, op);
+-                continue;
+-            } else if (i > 0) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[3];
+-                break;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        CASE_OP_32_64(brcond):
++            done = fold_brcond(&ctx, op);
++            break;
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 24/56] tcg/optimize: Split out fold_setcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 23 ++++++++++++++---------
+file changed, 14 insertions(+), 9 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[3];
++    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[5];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(setcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[3]);
+-            if (i >= 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        CASE_OP_32_64(setcond):
++            done = fold_setcond(&ctx, op);
++            break;
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 25/56] tcg/optimize: Split out fold_mulu2_i32
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 37 +++++++++++++++++++++----------------
+file changed, 21 insertions(+), 16 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
++        uint32_t a = arg_info(op->args[2])->val;
++        uint32_t b = arg_info(op->args[3])->val;
++        uint64_t r = (uint64_t)a * b;
++        TCGArg rl, rh;
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++
++        rl = op->args[0];
++        rh = op->args[1];
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
++        return true;
++    }
++    return false;
++}
++
+ static bool fold_nand(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_mulu2_i32:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+-                uint32_t a = arg_info(op->args[2])->val;
+-                uint32_t b = arg_info(op->args[3])->val;
+-                uint64_t r = (uint64_t)a * b;
+-                TCGArg rl, rh;
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
+-
+-                rl = op->args[0];
+-                rh = op->args[1];
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(muluh):
+             done = fold_mul_highpart(&ctx, op);
+             break;
++        case INDEX_op_mulu2_i32:
++            done = fold_mulu2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64(nand):
+             done = fold_nand(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 26/56] tcg/optimize: Split out fold_addsub2_i32
+Add two additional helpers, fold_add2_i32 and fold_sub2_i32
+which will not be simple wrappers forever.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
+file changed, 44 insertions(+), 26 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
++        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
++        uint32_t al = arg_info(op->args[2])->val;
++        uint32_t ah = arg_info(op->args[3])->val;
++        uint32_t bl = arg_info(op->args[4])->val;
++        uint32_t bh = arg_info(op->args[5])->val;
++        uint64_t a = ((uint64_t)ah << 32) | al;
++        uint64_t b = ((uint64_t)bh << 32) | bl;
++        TCGArg rl, rh;
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++
++        if (add) {
++            a += b;
++        } else {
++            a -= b;
++        }
++
++        rl = op->args[0];
++        rh = op->args[1];
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
++        return true;
++    }
++    return false;
++}
++
++static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
++{
++    return fold_addsub2_i32(ctx, op, true);
++}
++
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
++{
++    return fold_addsub2_i32(ctx, op, false);
++}
++
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_add2_i32:
+-        case INDEX_op_sub2_i32:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
+-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+-                uint32_t al = arg_info(op->args[2])->val;
+-                uint32_t ah = arg_info(op->args[3])->val;
+-                uint32_t bl = arg_info(op->args[4])->val;
+-                uint32_t bh = arg_info(op->args[5])->val;
+-                uint64_t a = ((uint64_t)ah << 32) | al;
+-                uint64_t b = ((uint64_t)bh << 32) | bl;
+-                TCGArg rl, rh;
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
+-
+-                if (opc == INDEX_op_add2_i32) {
+-                    a += b;
+-                } else {
+-                    a -= b;
+-                }
+-
+-                rl = op->args[0];
+-                rh = op->args[1];
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
+-                continue;
+-            }
+-            break;
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(add):
+             done = fold_add(&ctx, op);
+             break;
++        case INDEX_op_add2_i32:
++            done = fold_add2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(and):
+             done = fold_and(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
++        case INDEX_op_sub2_i32:
++            done = fold_sub2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 27/56] tcg/optimize: Split out fold_movcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
+ 1 file changed, 31 insertions(+), 25 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_movcond(OptContext *ctx, TCGOp *op)
++{
++    TCGOpcode opc = op->opc;
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
++    }
++
++    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
++        uint64_t tv = arg_info(op->args[3])->val;
++        uint64_t fv = arg_info(op->args[4])->val;
++
++        opc = (opc == INDEX_op_movcond_i32
++               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
++
++        if (tv == 1 && fv == 0) {
++            op->opc = opc;
++            op->args[3] = cond;
++        } else if (fv == 1 && tv == 0) {
++            op->opc = opc;
++            op->args[3] = tcg_invert_cond(cond);
++        }
++    }
++    return false;
++}
++
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(movcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[5]);
+-            if (i >= 0) {
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
+-                continue;
+-            }
+-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+-                uint64_t tv = arg_info(op->args[3])->val;
+-                uint64_t fv = arg_info(op->args[4])->val;
+-                TCGCond cond = op->args[5];
+-
+-                if (fv == 1 && tv == 0) {
+-                    cond = tcg_invert_cond(cond);
+-                } else if (!(tv == 1 && fv == 0)) {
+-                    break;
+-                }
+-                op->args[3] = cond;
+-                op->opc = opc = (opc == INDEX_op_movcond_i32
+-                                 ? INDEX_op_setcond_i32
+-                                 : INDEX_op_setcond_i64);
+-            }
+-            break;
+-
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_mb:
+             done = fold_mb(&ctx, op);
+             break;
++        CASE_OP_32_64(movcond):
++            done = fold_movcond(&ctx, op);
++            break;
+         CASE_OP_32_64(mul):
+             done = fold_mul(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 28/56] tcg/optimize: Split out fold_extract2
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
+ 1 file changed, 22 insertions(+), 17 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_extract2(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t v1 = arg_info(op->args[1])->val;
++        uint64_t v2 = arg_info(op->args[2])->val;
++        int shr = op->args[3];
++
++        if (op->opc == INDEX_op_extract2_i64) {
++            v1 >>= shr;
++            v2 <<= 64 - shr;
++        } else {
++            v1 = (uint32_t)v1 >> shr;
++            v2 = (int32_t)v2 << (32 - shr);
++        }
++        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
++    }
++    return false;
++}
++
+ static bool fold_exts(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const1(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(extract2):
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                uint64_t v1 = arg_info(op->args[1])->val;
+-                uint64_t v2 = arg_info(op->args[2])->val;
+-                int shr = op->args[3];
+-
+-                if (opc == INDEX_op_extract2_i64) {
+-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
+-                } else {
+-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
+-                                    ((uint32_t)v2 << (32 - shr)));
+-                }
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
++        CASE_OP_32_64(extract2):
++            done = fold_extract2(&ctx, op);
++            break;
+         CASE_OP_32_64(ext8s):
+         CASE_OP_32_64(ext16s):
+         case INDEX_op_ext32s_i64:
+--
+.25.1

-New patch
+[PULL 29/56] tcg/optimize: Split out fold_extract, fold_sextract
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
+ 1 file changed, 30 insertions(+), 18 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_extract(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t;
++
++        t = arg_info(op->args[1])->val;
++        t = extract64(t, op->args[2], op->args[3]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_extract2(OptContext *ctx, TCGOp *op)
+ {
+     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+ }
++static bool fold_sextract(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t;
++
++        t = arg_info(op->args[1])->val;
++        t = sextract64(t, op->args[2], op->args[3]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(extract):
+-            if (arg_is_const(op->args[1])) {
+-                tmp = extract64(arg_info(op->args[1])->val,
+-                                op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+-        CASE_OP_32_64(sextract):
+-            if (arg_is_const(op->args[1])) {
+-                tmp = sextract64(arg_info(op->args[1])->val,
+-                                 op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
++        CASE_OP_32_64(extract):
++            done = fold_extract(&ctx, op);
++            break;
+         CASE_OP_32_64(extract2):
+             done = fold_extract2(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(sextract):
++            done = fold_sextract(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 30/56] tcg/optimize: Split out fold_deposit
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 25 +++++++++++++++----------
+ 1 file changed, 15 insertions(+), 10 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+     return fold_const1(ctx, op);
+ }
++static bool fold_deposit(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t t1 = arg_info(op->args[1])->val;
++        uint64_t t2 = arg_info(op->args[2])->val;
++
++        t1 = deposit64(t1, op->args[3], op->args[4], t2);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
++    }
++    return false;
++}
++
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(deposit):
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tmp = deposit64(arg_info(op->args[1])->val,
+-                                op->args[3], op->args[4],
+-                                arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
++        CASE_OP_32_64(deposit):
++            done = fold_deposit(&ctx, op);
++            break;
+         CASE_OP_32_64(div):
+         CASE_OP_32_64(divu):
+             done = fold_divide(&ctx, op);
+--
+.25.1

-New patch
+[PULL 31/56] tcg/optimize: Split out fold_count_zeros
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++
++        if (t != 0) {
++            t = do_constant_folding(op->opc, t, 0);
++            return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++        }
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
++    }
++    return false;
++}
++
+ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const1(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(clz):
+-        CASE_OP_32_64(ctz):
+-            if (arg_is_const(op->args[1])) {
+-                TCGArg v = arg_info(op->args[1])->val;
+-                if (v != 0) {
+-                    tmp = do_constant_folding(opc, v, 0);
+-                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                } else {
+-                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
+-                }
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(clz):
++        CASE_OP_32_64(ctz):
++            done = fold_count_zeros(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 32/56] tcg/optimize: Split out fold_bswap
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 27 ++++++++++++++++-----------
+ 1 file changed, 16 insertions(+), 11 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+     return false;
+ }
++static bool fold_bswap(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++
++        t = do_constant_folding(op->opc, t, op->args[2]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(bswap16):
+-        CASE_OP_32_64(bswap32):
+-        case INDEX_op_bswap64_i64:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+-                                          op->args[2]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(bswap16):
++        CASE_OP_32_64(bswap32):
++        case INDEX_op_bswap64_i64:
++            done = fold_bswap(&ctx, op);
++            break;
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+             done = fold_count_zeros(&ctx, op);
+--
+.25.1

-New patch
+[PULL 33/56] tcg/optimize: Split out fold_dup, fold_dup2
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
+ 1 file changed, 31 insertions(+), 22 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_dup(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++        t = dup_const(TCGOP_VECE(op), t);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
++static bool fold_dup2(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
++                               arg_info(op->args[2])->val);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++
++    if (args_are_copies(op->args[1], op->args[2])) {
++        op->opc = INDEX_op_dup_vec;
++        TCGOP_VECE(op) = MO_32;
++    }
++    return false;
++}
++
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+             break;
+-        case INDEX_op_dup_vec:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = arg_info(op->args[1])->val;
+-                tmp = dup_const(TCGOP_VECE(op), tmp);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+-        case INDEX_op_dup2_vec:
+-            assert(TCG_TARGET_REG_BITS == 32);
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0],
+-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
+-                                           arg_info(op->args[2])->val));
+-                continue;
+-            } else if (args_are_copies(op->args[1], op->args[2])) {
+-                op->opc = INDEX_op_dup_vec;
+-                TCGOP_VECE(op) = MO_32;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(divu):
+             done = fold_divide(&ctx, op);
+             break;
++        case INDEX_op_dup_vec:
++            done = fold_dup(&ctx, op);
++            break;
++        case INDEX_op_dup2_vec:
++            done = fold_dup2(&ctx, op);
++            break;
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 34/56] tcg/optimize: Split out fold_mov
+This is the final entry in the main switch that was in a
+different form.  After this, we have the option to convert
+the switch into a function dispatch table.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 27 ++++++++++++++-------------
+ 1 file changed, 14 insertions(+), 13 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_mov(OptContext *ctx, TCGOp *op)
++{
++    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
++}
++
+ static bool fold_movcond(OptContext *ctx, TCGOp *op)
+ {
+     TCGOpcode opc = op->opc;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Propagate constants through copy operations and do constant
+-           folding.  Constants will be substituted to arguments by register
+-           allocator where needed and possible.  Also detect copies. */
++        /*
++         * Process each opcode.
++         * Sorted alphabetically by opcode as much as possible.
++         */
+         switch (opc) {
+-        CASE_OP_32_64_VEC(mov):
+-            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-            break;
+-
+-        default:
+-            break;
+-
+-        /* ---------------------------------------------------------- */
+-        /* Sorted alphabetically by opcode as much as possible. */
+-
+         CASE_OP_32_64_VEC(add):
+             done = fold_add(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_mb:
+             done = fold_mb(&ctx, op);
+             break;
++        CASE_OP_32_64_VEC(mov):
++            done = fold_mov(&ctx, op);
++            break;
+         CASE_OP_32_64(movcond):
+             done = fold_movcond(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
+             break;
++        default:
++            break;
+         }
+         if (!done) {
+--
+.25.1

-New patch
+[PULL 35/56] tcg/optimize: Split out fold_xx_to_i
+Pull the "op r, a, a => movi r, 0" optimization into a function,
+and use it in the outer opcode fold functions.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
+ 1 file changed, 24 insertions(+), 17 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
+     return false;
+ }
++/* If the binary operation has both arguments equal, fold to @i. */
++static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
++{
++    if (args_are_copies(op->args[1], op->args[2])) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ /*
+  * These outermost fold_<op> functions are sorted alphabetically.
+  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ /* Propagate constants and copies, fold constant expressions. */
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
+-        switch (opc) {
+-        CASE_OP_32_64_VEC(andc):
+-        CASE_OP_32_64_VEC(sub):
+-        CASE_OP_32_64_VEC(xor):
+-            if (args_are_copies(op->args[1], op->args[2])) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+-                continue;
+-            }
+-            break;
+-        default:
+-            break;
+-        }
+-
+         /*
+          * Process each opcode.
+          * Sorted alphabetically by opcode as much as possible.
+--
+.25.1

-New patch
+[PULL 36/56] tcg/optimize: Split out fold_xx_to_x
+Pull the "op r, a, a => mov r, a" optimization into a function,
+and use it in the outer opcode fold functions.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
+ 1 file changed, 24 insertions(+), 15 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+     return false;
+ }
++/* If the binary operation has both arguments equal, fold to identity. */
++static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
++{
++    if (args_are_copies(op->args[1], op->args[2])) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
++    }
++    return false;
++}
++
+ /*
+  * These outermost fold_<op> functions are sorted alphabetically.
++ *
++ * The ordering of the transformations should be:
++ *   1) those that produce a constant
++ *   2) those that produce a copy
++ *   3) those that produce information about the result value.
+  */
+ static bool fold_add(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_x(ctx, op)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
+ static bool fold_or(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_x(ctx, op)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Simplify expression for "op r, a, a => mov r, a" cases */
+-        switch (opc) {
+-        CASE_OP_32_64_VEC(or):
+-        CASE_OP_32_64_VEC(and):
+-            if (args_are_copies(op->args[1], op->args[2])) {
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-                continue;
+-            }
+-            break;
+-        default:
+-            break;
+-        }
+-
+         /*
+          * Process each opcode.
+          * Sorted alphabetically by opcode as much as possible.
+--
+.25.1

-New patch
+[PULL 37/56] tcg/optimize: Split out fold_xi_to_i
+Pull the "op r, a, 0 => movi r, 0" optimization into a function,
+and use it in the outer opcode fold functions.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 38 ++++++++++++++++++++------------------
+ 1 file changed, 20 insertions(+), 18 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
+     return false;
+ }
++/* If the binary operation has second argument @i, fold to @i. */
++static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
++{
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ /* If the binary operation has both arguments equal, fold to @i. */
+ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+ {
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
++        fold_xi_to_i(ctx, op, 0) ||
+         fold_xx_to_x(ctx, op)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             continue;
+         }
+-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
+-        switch (opc) {
+-        CASE_OP_32_64_VEC(and):
+-        CASE_OP_32_64_VEC(mul):
+-        CASE_OP_32_64(muluh):
+-        CASE_OP_32_64(mulsh):
+-            if (arg_is_const(op->args[2])
+-                && arg_info(op->args[2])->val == 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+-                continue;
+-            }
+-            break;
+-        default:
+-            break;
+-        }
+-
+         /*
+          * Process each opcode.
+          * Sorted alphabetically by opcode as much as possible.
+--
+.25.1

-[PATCH 26/27] include/exec: lightly re-arrange TranslationBlock
+[PULL 38/56] tcg/optimize: Add type to OptContext
-From: Alex Bennée <alex.bennee@linaro.org>
+Compute the type of the operation early.
-Lets make sure all the flags we compare when looking up blocks are
+There are at least 4 places that used a def->flags ladder
-together in the same place.
+to determine the type of the operation being optimized.
-Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
+There were two places that assumed !TCG_OPF_64BIT means
-Message-Id: <20210224165811.11567-5-alex.bennee@linaro.org>
+TCG_TYPE_I32, and so could potentially compute incorrect
 results for vector operations.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h | 8 +++++---
+ tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
- 1 file changed, 5 insertions(+), 3 deletions(-)
+ 1 file changed, 89 insertions(+), 60 deletions(-)
-diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-     target_ulong pc;   /* simulated PC corresponding to this block (EIP + CS base) */
-     target_ulong cs_base; /* CS base for this block */
+     /* In flight values from optimization. */
-     uint32_t flags; /* flags defining in which context the code was generated */
+     uint64_t z_mask;
--    uint16_t size;      /* size of target code for this block (1 <=
++    TCGType type;
--                           size <= TARGET_PAGE_SIZE) */
+ } OptContext;
--    uint16_t icount;
-     uint32_t cflags;    /* compile flags */
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
- #define CF_COUNT_MASK  0x00007fff
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
- #define CF_LAST_IO     0x00008000 /* Last insn may be an IO access.  */
+ {
-@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
+     TCGTemp *dst_ts = arg_temp(dst);
-     /* Per-vCPU dynamic tracing state used to generate this TB */
+     TCGTemp *src_ts = arg_temp(src);
-     uint32_t trace_vcpu_dstate;
+-    const TCGOpDef *def;
+     TempOptInfo *di;
-+    /* Above fields used for comparing */
+     TempOptInfo *si;
-+    uint16_t size;      /* size of target code for this block (1 <=
+     uint64_t z_mask;
-+                           size <= TARGET_PAGE_SIZE) */
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-+    uint16_t icount;
+     reset_ts(dst_ts);
      di = ts_info(dst_ts);
      si = ts_info(src_ts);
 -    def = &tcg_op_defs[op->opc];
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        new_op = INDEX_op_mov_vec;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        new_op = INDEX_op_mov_i64;
 -    } else {
 +
-     struct tb_tc tc;
++    switch (ctx->type) {
++    case TCG_TYPE_I32:
-     /* first and second physical page containing code. The lower bit
+         new_op = INDEX_op_mov_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        new_op = INDEX_op_mov_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
 +        new_op = INDEX_op_mov_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
      }
      op->opc = new_op;
 -    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
      op->args[0] = dst;
      op->args[1] = src;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op->opc];
 -    TCGType type;
 -    TCGTemp *tv;
 -
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        type = TCG_TYPE_I64;
 -    } else {
 -        type = TCG_TYPE_I32;
 -    }
 -
      /* Convert movi to mov with constant temp. */
 -    tv = tcg_constant_internal(type, val);
 +    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
 +
      init_ts_info(ctx, tv);
      return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
      }
  }
 -static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
 +static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
 +                                    uint64_t x, uint64_t y)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op];
      uint64_t res = do_constant_folding_2(op, x, y);
 -    if (!(def->flags & TCG_OPF_64BIT)) {
 +    if (type == TCG_TYPE_I32) {
          res = (int32_t)res;
      }
      return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
   * Return -1 if the condition can't be simplified,
   * and the result of the condition (0 or 1) if it can.
   */
 -static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +static int do_constant_folding_cond(TCGType type, TCGArg x,
                                      TCGArg y, TCGCond c)
  {
      uint64_t xv = arg_info(x)->val;
      uint64_t yv = arg_info(y)->val;
      if (arg_is_const(x) && arg_is_const(y)) {
 -        const TCGOpDef *def = &tcg_op_defs[op];
 -        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
 -        if (def->flags & TCG_OPF_64BIT) {
 -            return do_constant_folding_cond_64(xv, yv, c);
 -        } else {
 +        switch (type) {
 +        case TCG_TYPE_I32:
              return do_constant_folding_cond_32(xv, yv, c);
 +        case TCG_TYPE_I64:
 +            return do_constant_folding_cond_64(xv, yv, c);
 +        default:
 +            /* Only scalar comparisons are optimizable */
 +            return -1;
          }
      } else if (args_are_copies(x, y)) {
          return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, 0);
 +        t = do_constant_folding(op->opc, ctx->type, t, 0);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
          uint64_t t1 = arg_info(op->args[1])->val;
          uint64_t t2 = arg_info(op->args[2])->val;
 -        t1 = do_constant_folding(op->opc, t1, t2);
 +        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                       op->args[2], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
              goto do_brcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, op->args[2]);
 +        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          uint64_t t = arg_info(op->args[1])->val;
          if (t != 0) {
 -            t = do_constant_folding(op->opc, t, 0);
 +            t = do_constant_folding(op->opc, ctx->type, t, 0);
              return tcg_opt_gen_movi(ctx, op, op->args[0], t);
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
 -    TCGOpcode opc = op->opc;
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
 +        TCGOpcode opc;
 -        opc = (opc == INDEX_op_movcond_i32
 -               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
 +        switch (ctx->type) {
 +        case TCG_TYPE_I32:
 +            opc = INDEX_op_setcond_i32;
 +            break;
 +        case TCG_TYPE_I64:
 +            opc = INDEX_op_setcond_i64;
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
          if (tv == 1 && fv == 0) {
              op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
              goto do_setcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                       op->args[4], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
          copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 +        /* Pre-compute the type of the operation. */
 +        if (def->flags & TCG_OPF_VECTOR) {
 +            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
 +        } else if (def->flags & TCG_OPF_64BIT) {
 +            ctx.type = TCG_TYPE_I64;
 +        } else {
 +            ctx.type = TCG_TYPE_I32;
 +        }
 +
          /* For commutative operations make constant second argument */
          switch (opc) {
          CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Proceed with possible constant folding. */
                      break;
                  }
 -                if (opc == INDEX_op_sub_i32) {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      neg_op = INDEX_op_neg_i32;
                      have_neg = TCG_TARGET_HAS_neg_i32;
 -                } else if (opc == INDEX_op_sub_i64) {
 +                    break;
 +                case TCG_TYPE_I64:
                      neg_op = INDEX_op_neg_i64;
                      have_neg = TCG_TARGET_HAS_neg_i64;
 -                } else if (TCG_TARGET_HAS_neg_vec) {
 -                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -                    unsigned vece = TCGOP_VECE(op);
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
 -                } else {
                      break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    neg_op = INDEX_op_neg_vec;
 +                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 +                                                   TCGOP_VECE(op)) > 0;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_neg) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGOpcode not_op;
                  bool have_not;
 -                if (def->flags & TCG_OPF_VECTOR) {
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                } else if (def->flags & TCG_OPF_64BIT) {
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                } else {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      not_op = INDEX_op_not_i32;
                      have_not = TCG_TARGET_HAS_not_i32;
 +                    break;
 +                case TCG_TYPE_I64:
 +                    not_op = INDEX_op_not_i64;
 +                    have_not = TCG_TARGET_HAS_not_i64;
 +                    break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    not_op = INDEX_op_not_vec;
 +                    have_not = TCG_TARGET_HAS_not_vec;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_not) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
          partmask = z_mask;
 -        if (!(def->flags & TCG_OPF_64BIT)) {
 +        if (ctx.type == TCG_TYPE_I32) {
              z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
 --
 .25.1

-[PATCH 23/27] accel/tcg: rename tb_lookup__cpu_state and hoist state extraction
+[PULL 39/56] tcg/optimize: Split out fold_to_not
-From: Alex Bennée <alex.bennee@linaro.org>
+Split out the conditional conversion from a more complex logical
+operation to a simple NOT.  Create a couple more helpers to make
-Having a function return both a valid TB and some system state
+this easy for the outer-most logical operations.
-seems excessive. It will make the subsequent re-factoring easier if we
-lookup the current state where we are.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
 Message-Id: <20210224165811.11567-2-alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/tb-lookup.h | 18 ++++++++----------
+ tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
- accel/tcg/cpu-exec.c     | 10 ++++++++--
+file changed, 86 insertions(+), 72 deletions(-)
- accel/tcg/tcg-runtime.c  |  4 +++-
-files changed, 19 insertions(+), 13 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/tb-lookup.h
+--- a/tcg/optimize.c
-+++ b/include/exec/tb-lookup.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
- #include "exec/tb-hash.h"
+     return false;
+ }
- /* Might cause an exception, so have a longjmp destination ready */
--static inline TranslationBlock *
++/*
--tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
++ * Convert @op to NOT, if NOT is supported by the host.
--                     uint32_t *flags, uint32_t cf_mask)
++ * Return true if the conversion is successful, which will still
-+static inline TranslationBlock * tb_lookup(CPUState *cpu,
++ * indicate that the processing is complete.
-+                                           target_ulong pc, target_ulong cs_base,
++ */
-+                                           uint32_t flags, uint32_t cf_mask)
++static bool fold_not(OptContext *ctx, TCGOp *op);
- {
++static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
--    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
++{
-     TranslationBlock *tb;
++    TCGOpcode not_op;
-     uint32_t hash;
++    bool have_not;
++
--    cpu_get_tb_cpu_state(env, pc, cs_base, flags);
++    switch (ctx->type) {
--    hash = tb_jmp_cache_hash_func(*pc);
++    case TCG_TYPE_I32:
-+    hash = tb_jmp_cache_hash_func(pc);
++        not_op = INDEX_op_not_i32;
-     tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
++        have_not = TCG_TARGET_HAS_not_i32;
++        break;
-     cf_mask &= ~CF_CLUSTER_MASK;
++    case TCG_TYPE_I64:
-     cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT;
++        not_op = INDEX_op_not_i64;
++        have_not = TCG_TARGET_HAS_not_i64;
-     if (likely(tb &&
++        break;
--               tb->pc == *pc &&
++    case TCG_TYPE_V64:
--               tb->cs_base == *cs_base &&
++    case TCG_TYPE_V128:
--               tb->flags == *flags &&
++    case TCG_TYPE_V256:
-+               tb->pc == pc &&
++        not_op = INDEX_op_not_vec;
-+               tb->cs_base == cs_base &&
++        have_not = TCG_TARGET_HAS_not_vec;
-+               tb->flags == flags &&
++        break;
-                tb->trace_vcpu_dstate == *cpu->trace_dstate &&
++    default:
-                (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) {
++        g_assert_not_reached();
-         return tb;
++    }
 +    if (have_not) {
 +        op->opc = not_op;
 +        op->args[1] = op->args[idx];
 +        return fold_not(ctx, op);
 +    }
 +    return false;
 +}
 +
 +/* If the binary operation has first argument @i, fold to NOT. */
 +static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
 +        return fold_to_not(ctx, op, 2);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has second argument @i, fold to @i. */
  static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
      return false;
  }
 +/* If the binary operation has second argument @i, fold to NOT. */
 +static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return fold_to_not(ctx, op, 1);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has both arguments equal, fold to @i. */
  static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
--    tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags, cf_mask);
+     return false;
-+    tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
-     if (tb == NULL) {
-         return NULL;
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, -1)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_not(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    /* Because of fold_to_not, we want to always return true, via finish. */
 +    finish_folding(ctx, op);
 +    return true;
  }
  static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_ix_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
-diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
+     return false;
-index XXXXXXX..XXXXXXX 100644
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
---- a/accel/tcg/cpu-exec.c
+                 }
-+++ b/accel/tcg/cpu-exec.c
+             }
-@@ -XXX,XX +XXX,XX @@ static void cpu_exec_exit(CPUState *cpu)
+             break;
+-        CASE_OP_32_64_VEC(xor):
- void cpu_exec_step_atomic(CPUState *cpu)
+-        CASE_OP_32_64(nand):
- {
+-            if (!arg_is_const(op->args[1])
-+    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+-                && arg_is_const(op->args[2])
-     TranslationBlock *tb;
+-                && arg_info(op->args[2])->val == -1) {
-     target_ulong cs_base, pc;
+-                i = 1;
-     uint32_t flags;
+-                goto try_not;
-@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
+-            }
-         g_assert(!cpu->running);
+-            break;
-         cpu->running = true;
+-        CASE_OP_32_64(nor):
+-            if (!arg_is_const(op->args[1])
--        tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
+-                && arg_is_const(op->args[2])
-+        cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+-                && arg_info(op->args[2])->val == 0) {
-+        tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask);
+-                i = 1;
-+
+-                goto try_not;
-         if (tb == NULL) {
+-            }
-             mmap_lock();
+-            break;
-             tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
+-        CASE_OP_32_64_VEC(andc):
-@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock *tb_find(CPUState *cpu,
+-            if (!arg_is_const(op->args[2])
-                                         TranslationBlock *last_tb,
+-                && arg_is_const(op->args[1])
-                                         int tb_exit, uint32_t cf_mask)
+-                && arg_info(op->args[1])->val == -1) {
- {
+-                i = 2;
-+    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+-                goto try_not;
-     TranslationBlock *tb;
+-            }
-     target_ulong cs_base, pc;
+-            break;
-     uint32_t flags;
+-        CASE_OP_32_64_VEC(orc):
+-        CASE_OP_32_64(eqv):
--    tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
+-            if (!arg_is_const(op->args[2])
-+    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+-                && arg_is_const(op->args[1])
-+
+-                && arg_info(op->args[1])->val == 0) {
-+    tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask);
+-                i = 2;
-     if (tb == NULL) {
+-                goto try_not;
-         mmap_lock();
+-            }
-         tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
+-            break;
-diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
+-        try_not:
-index XXXXXXX..XXXXXXX 100644
+-            {
---- a/accel/tcg/tcg-runtime.c
+-                TCGOpcode not_op;
-+++ b/accel/tcg/tcg-runtime.c
+-                bool have_not;
-@@ -XXX,XX +XXX,XX @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
+-
-     target_ulong cs_base, pc;
+-                switch (ctx.type) {
-     uint32_t flags;
+-                case TCG_TYPE_I32:
+-                    not_op = INDEX_op_not_i32;
--    tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, curr_cflags());
+-                    have_not = TCG_TARGET_HAS_not_i32;
-+    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+-                    break;
-+
+-                case TCG_TYPE_I64:
-+    tb = tb_lookup(cpu, pc, cs_base, flags, curr_cflags());
+-                    not_op = INDEX_op_not_i64;
-     if (tb == NULL) {
+-                    have_not = TCG_TARGET_HAS_not_i64;
-         return tcg_code_gen_epilogue;
+-                    break;
-     }
+-                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_not) {
 -                    break;
 -                }
 -                op->opc = not_op;
 -                reset_temp(op->args[0]);
 -                op->args[1] = op->args[i];
 -                continue;
 -            }
          default:
              break;
          }
 --
 .25.1

-[PATCH 20/27] tcg/tci: Merge extension operations
+[PULL 40/56] tcg/optimize: Split out fold_sub_to_neg
-This includes ext8s, ext8u, ext16s, ext16u.
+Even though there is only one user, place this more complex
 conversion into its own helper.
-Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 44 ++++++++------------------------------------
+ tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
-file changed, 8 insertions(+), 36 deletions(-)
+file changed, 47 insertions(+), 42 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
-             tci_write_reg64(regs, t1, t0, (uint32_t)t2 * tmp64);
-             break;
+ static bool fold_neg(OptContext *ctx, TCGOp *op)
- #endif /* TCG_TARGET_REG_BITS == 32 */
+ {
--#if TCG_TARGET_HAS_ext8s_i32
+-    return fold_const1(ctx, op);
--        case INDEX_op_ext8s_i32:
++    if (fold_const1(ctx, op)) {
-+#if TCG_TARGET_HAS_ext8s_i32 || TCG_TARGET_HAS_ext8s_i64
++        return true;
-+        CASE_32_64(ext8s)
++    }
-             t0 = *tb_ptr++;
++    /*
-             t1 = tci_read_r(regs, &tb_ptr);
++     * Because of fold_sub_to_neg, we want to always return true,
-             tci_write_reg(regs, t0, (int8_t)t1);
++     * via finish_folding.
-             break;
++     */
- #endif
++    finish_folding(ctx, op);
--#if TCG_TARGET_HAS_ext16s_i32
++    return true;
--        case INDEX_op_ext16s_i32:
+ }
-+#if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64
-+        CASE_32_64(ext16s)
+ static bool fold_nor(OptContext *ctx, TCGOp *op)
-             t0 = *tb_ptr++;
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-             t1 = tci_read_r(regs, &tb_ptr);
+     return fold_const2(ctx, op);
-             tci_write_reg(regs, t0, (int16_t)t1);
+ }
-             break;
- #endif
++static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
--#if TCG_TARGET_HAS_ext8u_i32
++{
--        case INDEX_op_ext8u_i32:
++    TCGOpcode neg_op;
-+#if TCG_TARGET_HAS_ext8u_i32 || TCG_TARGET_HAS_ext8u_i64
++    bool have_neg;
-+        CASE_32_64(ext8u)
++
-             t0 = *tb_ptr++;
++    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
-             t1 = tci_read_r(regs, &tb_ptr);
++        return false;
-             tci_write_reg(regs, t0, (uint8_t)t1);
++    }
-             break;
++
- #endif
++    switch (ctx->type) {
--#if TCG_TARGET_HAS_ext16u_i32
++    case TCG_TYPE_I32:
--        case INDEX_op_ext16u_i32:
++        neg_op = INDEX_op_neg_i32;
-+#if TCG_TARGET_HAS_ext16u_i32 || TCG_TARGET_HAS_ext16u_i64
++        have_neg = TCG_TARGET_HAS_neg_i32;
-+        CASE_32_64(ext16u)
++        break;
-             t0 = *tb_ptr++;
++    case TCG_TYPE_I64:
-             t1 = tci_read_r(regs, &tb_ptr);
++        neg_op = INDEX_op_neg_i64;
-             tci_write_reg(regs, t0, (uint16_t)t1);
++        have_neg = TCG_TARGET_HAS_neg_i64;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        neg_op = INDEX_op_neg_vec;
 +        have_neg = (TCG_TARGET_HAS_neg_vec &&
 +                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    if (have_neg) {
 +        op->opc = neg_op;
 +        op->args[1] = op->args[2];
 +        return fold_neg(ctx, op);
 +    }
 +    return false;
 +}
 +
  static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_sub_to_neg(ctx, op)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  continue;
              }
              break;
--#if TCG_TARGET_HAS_ext8u_i64
+-        CASE_OP_32_64_VEC(sub):
--        case INDEX_op_ext8u_i64:
+-            {
--            t0 = *tb_ptr++;
+-                TCGOpcode neg_op;
--            t1 = tci_read_r(regs, &tb_ptr);
+-                bool have_neg;
--            tci_write_reg(regs, t0, (uint8_t)t1);
+-
 -                if (arg_is_const(op->args[2])) {
 -                    /* Proceed with possible constant folding. */
 -                    break;
 -                }
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    neg_op = INDEX_op_neg_i32;
 -                    have_neg = TCG_TARGET_HAS_neg_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    neg_op = INDEX_op_neg_i64;
 -                    have_neg = TCG_TARGET_HAS_neg_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 -                                                   TCGOP_VECE(op)) > 0;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_neg) {
 -                    break;
 -                }
 -                if (arg_is_const(op->args[1])
 -                    && arg_info(op->args[1])->val == 0) {
 -                    op->opc = neg_op;
 -                    reset_temp(op->args[0]);
 -                    op->args[1] = op->args[2];
 -                    continue;
 -                }
 -            }
 -            break;
--#endif
+         default:
--#if TCG_TARGET_HAS_ext8s_i64
+             break;
--        case INDEX_op_ext8s_i64:
+         }
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, (int8_t)t1);
 -            break;
 -#endif
 -#if TCG_TARGET_HAS_ext16s_i64
 -        case INDEX_op_ext16s_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, (int16_t)t1);
 -            break;
 -#endif
 -#if TCG_TARGET_HAS_ext16u_i64
 -        case INDEX_op_ext16u_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, (uint16_t)t1);
 -            break;
 -#endif
  #if TCG_TARGET_HAS_ext32s_i64
          case INDEX_op_ext32s_i64:
  #endif
 --
 .25.1

-[PATCH 24/27] accel/tcg: move CF_CLUSTER calculation to curr_cflags
+[PULL 41/56] tcg/optimize: Split out fold_xi_to_x
-From: Alex Bennée <alex.bennee@linaro.org>
+Pull the "op r, a, i => mov r, a" optimization into a function,
 and use them in the outer-most logical operations.
-There is nothing special about this compile flag that doesn't mean we
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 can't just compute it with curr_cflags() which we should be using when
 building a new set.
 Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
 Message-Id: <20210224165811.11567-3-alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h   | 8 +++++---
+ tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
- include/exec/tb-lookup.h  | 3 ---
+file changed, 26 insertions(+), 35 deletions(-)
  accel/tcg/cpu-exec.c      | 9 ++++-----
  accel/tcg/tcg-runtime.c   | 2 +-
  accel/tcg/translate-all.c | 6 +++---
  softmmu/physmem.c         | 2 +-
 files changed, 14 insertions(+), 16 deletions(-)
-diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline uint32_t tb_cflags(const TranslationBlock *tb)
+@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
      return false;
  }
- /* current cflags for hashing/comparison */
++/* If the binary operation has second argument @i, fold to identity. */
--static inline uint32_t curr_cflags(void)
++static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
-+static inline uint32_t curr_cflags(CPUState *cpu)
++{
 +    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has second argument @i, fold to NOT. */
  static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
  {
--    return (parallel_cpus ? CF_PARALLEL : 0)
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
--         | (icount_enabled() ? CF_USE_ICOUNT : 0);
-+    uint32_t cflags = deposit32(0, CF_CLUSTER_SHIFT, 8, cpu->cluster_index);
+ static bool fold_add(OptContext *ctx, TCGOp *op)
-+    cflags |= parallel_cpus ? CF_PARALLEL : 0;
+ {
-+    cflags |= icount_enabled() ? CF_USE_ICOUNT : 0;
+-    return fold_const2(ctx, op);
-+    return cflags;
++    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
- /* TranslationBlock invalidate API */
+ static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
-diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
+ {
---- a/include/exec/tb-lookup.h
+     if (fold_const2(ctx, op) ||
-+++ b/include/exec/tb-lookup.h
+         fold_xi_to_i(ctx, op, 0) ||
-@@ -XXX,XX +XXX,XX @@ static inline TranslationBlock * tb_lookup(CPUState *cpu,
++        fold_xi_to_x(ctx, op, -1) ||
-     hash = tb_jmp_cache_hash_func(pc);
+         fold_xx_to_x(ctx, op)) {
      tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
 -    cf_mask &= ~CF_CLUSTER_MASK;
 -    cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT;
 -
      if (likely(tb &&
                 tb->pc == pc &&
                 tb->cs_base == cs_base &&
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
      TranslationBlock *tb;
      target_ulong cs_base, pc;
      uint32_t flags;
 -    uint32_t cflags = 1;
 -    uint32_t cf_mask = cflags & CF_HASH_MASK;
 +    uint32_t cflags = (curr_cflags(cpu) & ~CF_PARALLEL) | 1;
      int tb_exit;
      if (sigsetjmp(cpu->jmp_env, 0) == 0) {
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
          cpu->running = true;
          cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
 -        tb = tb_lookup(cpu, pc, cs_base, flags, cf_mask);
 +        tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
          if (tb == NULL) {
              mmap_lock();
@@ -XXX,XX +XXX,XX @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
          if (replay_has_exception()
              && cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra == 0) {
              /* Execute just one insn to trigger exception pending in the log */
 -            cpu->cflags_next_tb = (curr_cflags() & ~CF_USE_ICOUNT) | 1;
 +            cpu->cflags_next_tb = (curr_cflags(cpu) & ~CF_USE_ICOUNT) | 1;
          }
  #endif
          return false;
@@ -XXX,XX +XXX,XX @@ int cpu_exec(CPUState *cpu)
                 have CF_INVALID set, -1 is a convenient invalid value that
                 does not require tcg headers for cpu_common_reset.  */
              if (cflags == -1) {
 -                cflags = curr_cflags();
 +                cflags = curr_cflags(cpu);
              } else {
                  cpu->cflags_next_tb = -1;
              }
 diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-runtime.c
 +++ b/accel/tcg/tcg-runtime.c
@@ -XXX,XX +XXX,XX @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
 -    tb = tb_lookup(cpu, pc, cs_base, flags, curr_cflags());
 +    tb = tb_lookup(cpu, pc, cs_base, flags, curr_cflags(cpu));
      if (tb == NULL) {
          return tcg_code_gen_epilogue;
      }
 diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
      if (current_tb_modified) {
          page_collection_unlock(pages);
          /* Force execution of one insn next time.  */
 -        cpu->cflags_next_tb = 1 | curr_cflags();
 +        cpu->cflags_next_tb = 1 | curr_cflags(cpu);
          mmap_unlock();
          cpu_loop_exit_noexc(cpu);
      }
@@ -XXX,XX +XXX,XX @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
  #ifdef TARGET_HAS_PRECISE_SMC
      if (current_tb_modified) {
          /* Force execution of one insn next time.  */
 -        cpu->cflags_next_tb = 1 | curr_cflags();
 +        cpu->cflags_next_tb = 1 | curr_cflags(cpu);
          return true;
      }
- #endif
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
+ {
-      * operations only (which execute after completion) so we don't
+     if (fold_const2(ctx, op) ||
-      * double instrument the instruction.
+         fold_xx_to_i(ctx, op, 0) ||
-      */
++        fold_xi_to_x(ctx, op, 0) ||
--    cpu->cflags_next_tb = curr_cflags() | CF_MEMI_ONLY | CF_LAST_IO | n;
+         fold_ix_to_not(ctx, op, -1)) {
-+    cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_LAST_IO | n;
+         return true;
+     }
-     qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
-                            "cpu_io_recompile: rewound execution of TB to "
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
-diff --git a/softmmu/physmem.c b/softmmu/physmem.c
+ {
-index XXXXXXX..XXXXXXX 100644
+     if (fold_const2(ctx, op) ||
---- a/softmmu/physmem.c
++        fold_xi_to_x(ctx, op, -1) ||
-+++ b/softmmu/physmem.c
+         fold_xi_to_not(ctx, op, 0)) {
-@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
+         return true;
-                     cpu_loop_exit_restore(cpu, ra);
+     }
-                 } else {
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
-                     /* Force execution of one insn next time.  */
+ static bool fold_or(OptContext *ctx, TCGOp *op)
--                    cpu->cflags_next_tb = 1 | curr_cflags();
+ {
-+                    cpu->cflags_next_tb = 1 | curr_cflags(cpu);
+     if (fold_const2(ctx, op) ||
-                     mmap_unlock();
++        fold_xi_to_x(ctx, op, 0) ||
-                     if (ra) {
+         fold_xx_to_x(ctx, op)) {
-                         cpu_restore_state(cpu, ra, true);
+         return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, -1) ||
          fold_ix_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_sub_to_neg(ctx, op)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify expression for "op r, a, const => mov r, a" cases */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(add):
 -        CASE_OP_32_64_VEC(sub):
 -        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64_VEC(andc):
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -                continue;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(orc):
 -        CASE_OP_32_64(eqv):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == -1) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /* Simplify using known-zero bits. Currently only ops with a single
             output argument is supported. */
          z_mask = -1;
 --
 .25.1

-[PATCH 05/27] tcg: Split out tcg_raise_tb_overflow
+[PULL 42/56] tcg/optimize: Split out fold_ix_to_i
-Allow other places in tcg to restart with a smaller tb.
+Pull the "op r, 0, b => movi r, 0" optimization into a function,
 and use it in fold_shift.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tcg.c | 9 +++++++--
+ tcg/optimize.c | 28 ++++++++++------------------
-file changed, 7 insertions(+), 2 deletions(-)
+file changed, 10 insertions(+), 18 deletions(-)
-diff --git a/tcg/tcg.c b/tcg/tcg.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tcg.c
+--- a/tcg/optimize.c
-+++ b/tcg/tcg.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which)
+@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
-     s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
+     return false;
  }
-+/* Signal overflow, starting over with fewer guest insns. */
++/* If the binary operation has first argument @i, fold to @i. */
-+static void QEMU_NORETURN tcg_raise_tb_overflow(TCGContext *s)
++static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
-+    siglongjmp(s->jmp_trans, -2);
++    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +    }
 +    return false;
 +}
 +
- #define C_PFX1(P, A)                    P##A
+ /* If the binary operation has first argument @i, fold to NOT. */
- #define C_PFX2(P, A, B)                 P##A##_##B
+ static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
- #define C_PFX3(P, A, B, C)              P##A##_##B##_##C
+ {
-@@ -XXX,XX +XXX,XX @@ static TCGTemp *tcg_temp_alloc(TCGContext *s)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
-     int n = s->nb_temps++;
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
-     if (n >= TCG_MAX_TEMPS) {
+     if (fold_const2(ctx, op) ||
--        /* Signal overflow, starting over with fewer guest insns. */
++        fold_ix_to_i(ctx, op, 0) ||
--        siglongjmp(s->jmp_trans, -2);
+         fold_xi_to_x(ctx, op, 0)) {
-+        tcg_raise_tb_overflow(s);
+         return true;
      }
-     return memset(&s->temps[n], 0, sizeof(TCGTemp));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- }
+             break;
          }
 -        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
 -           and "sub r, 0, a => neg r, a" case.  */
 -        switch (opc) {
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -            if (arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /* Simplify using known-zero bits. Currently only ops with a single
             output argument is supported. */
          z_mask = -1;
 --
 .25.1

-[PATCH 08/27] tcg/tci: Merge identical cases in generation (exchange opcodes)
+[PULL 43/56] tcg/optimize: Split out fold_masks
-Use CASE_32_64 and CASE_64 to reduce ifdefs and merge
+Move all of the known-zero optimizations into the per-opcode
-cases that are identical between 32-bit and 64-bit hosts.
+functions.  Use fold_masks when there is a possibility of the
 result being determined, and simply set ctx->z_mask otherwise.
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org>
 [PMD: Split patch as 2/5]
 Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Message-Id: <20210218232840.1760806-3-f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target.c.inc | 35 ++++++++++++++---------------------
+ tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
-file changed, 14 insertions(+), 21 deletions(-)
+file changed, 294 insertions(+), 251 deletions(-)
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-         tcg_out8(s, args[2]);           /* condition */
+     TCGTempSet temps_used;
-         tci_out_label(s, arg_label(args[3]));
      /* In flight values from optimization. */
 -    uint64_t z_mask;
 +    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
 +    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
      TCGType type;
  } OptContext;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_masks(OptContext *ctx, TCGOp *op)
 +{
 +    uint64_t a_mask = ctx->a_mask;
 +    uint64_t z_mask = ctx->z_mask;
 +
 +    /*
 +     * 32-bit ops generate 32-bit results.  For the result-is-zero test
 +     * below, we can ignore high bits, but for further optimizations we
 +     * need to record that the high bits contain garbage.
 +     */
 +    if (ctx->type == TCG_TYPE_I32) {
 +        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
 +        a_mask &= MAKE_64BIT_MASK(0, 32);
 +        z_mask &= MAKE_64BIT_MASK(0, 32);
 +    }
 +
 +    if (z_mask == 0) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
 +    }
 +    if (a_mask == 0) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +    }
 +    return false;
 +}
 +
  /*
   * Convert @op to NOT, if NOT is supported by the host.
   * Return true if the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z1, z2;
 +
      if (fold_const2(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +    z2 = arg_info(op->args[2])->z_mask;
 +    ctx->z_mask = z1 & z2;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer affected bits from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        ctx->a_mask = z1 & ~z2;
 +    }
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z1;
 +
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer anything from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
 +        ctx->a_mask = z1 & ~z2;
 +        z1 &= z2;
 +    }
 +    ctx->z_mask = z1;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask, sign;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
          t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    switch (op->opc) {
 +    case INDEX_op_bswap16_i32:
 +    case INDEX_op_bswap16_i64:
 +        z_mask = bswap16(z_mask);
 +        sign = INT16_MIN;
 +        break;
 +    case INDEX_op_bswap32_i32:
 +    case INDEX_op_bswap32_i64:
 +        z_mask = bswap32(z_mask);
 +        sign = INT32_MIN;
 +        break;
 +    case INDEX_op_bswap64_i64:
 +        z_mask = bswap64(z_mask);
 +        sign = INT64_MIN;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 +    case TCG_BSWAP_OZ:
 +        break;
 +    case TCG_BSWAP_OS:
 +        /* If the sign bit may be 1, force all the bits above to 1. */
 +        if (z_mask & sign) {
 +            z_mask |= sign;
 +        }
 +        break;
 +    default:
 +        /* The high bits are undefined: force all bits above the sign to 1. */
 +        z_mask |= sign << 1;
 +        break;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
  static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
      }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        z_mask = 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        z_mask = 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
 +
      return false;
  }
  static bool fold_ctpop(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        ctx->z_mask = 32 | 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        ctx->z_mask = 64 | 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
  }
  static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
          t1 = deposit64(t1, op->args[3], op->args[4], t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
 +
 +    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                            op->args[3], op->args[4],
 +                            arg_info(op->args[2])->z_mask);
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
          t = extract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask, sign;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8s):
 +        sign = INT8_MIN;
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16s):
 +        sign = INT16_MIN;
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_ext_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32s_i64:
 +        sign = INT32_MIN;
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    if (z_mask & sign) {
 +        z_mask |= sign;
 +    } else if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extu(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8u):
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16u):
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_extrl_i64_i32:
 +    case INDEX_op_extu_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32u_i64:
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    case INDEX_op_extrh_i64_i32:
 +        type_change = true;
 +        z_mask >>= 32;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    ctx->z_mask = z_mask;
 +    if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    return fold_masks(ctx, op);
  }
  static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
 +    ctx->z_mask = arg_info(op->args[3])->z_mask
 +                | arg_info(op->args[4])->z_mask;
 +
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
  static bool fold_neg(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (fold_const1(ctx, op)) {
          return true;
      }
 +
 +    /* Set to 1 all bits to the left of the rightmost.  */
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    ctx->z_mask = -(z_mask & -z_mask);
 +
      /*
       * Because of fold_sub_to_neg, we want to always return true,
       * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
 +    const TCGOpDef *def = &tcg_op_defs[op->opc];
 +    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 +    MemOp mop = get_memop(oi);
 +    int width = 8 * memop_size(mop);
 +
 +    if (!(mop & MO_SIGN) && width < 64) {
 +        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    }
 +
      /* Opcodes that touch guest memory stop the mb optimization.  */
      ctx->prev_mb = NULL;
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
 +
 +    ctx->z_mask = 1;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          op->opc = INDEX_op_setcond_i32;
          break;
--    case INDEX_op_bswap16_i64:  /* Optional (TCG_TARGET_HAS_bswap16_i64). */
+     }
--    case INDEX_op_bswap32_i64:  /* Optional (TCG_TARGET_HAS_bswap32_i64). */
++
--    case INDEX_op_bswap64_i64:  /* Optional (TCG_TARGET_HAS_bswap64_i64). */
++    ctx->z_mask = 1;
--    case INDEX_op_not_i64:      /* Optional (TCG_TARGET_HAS_not_i64). */
+     return false;
--    case INDEX_op_neg_i64:      /* Optional (TCG_TARGET_HAS_neg_i64). */
--    case INDEX_op_ext8s_i64:    /* Optional (TCG_TARGET_HAS_ext8s_i64). */
+  do_setcond_const:
--    case INDEX_op_ext8u_i64:    /* Optional (TCG_TARGET_HAS_ext8u_i64). */
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
--    case INDEX_op_ext16s_i64:   /* Optional (TCG_TARGET_HAS_ext16s_i64). */
--    case INDEX_op_ext16u_i64:   /* Optional (TCG_TARGET_HAS_ext16u_i64). */
+ static bool fold_sextract(OptContext *ctx, TCGOp *op)
--    case INDEX_op_ext32s_i64:   /* Optional (TCG_TARGET_HAS_ext32s_i64). */
+ {
--    case INDEX_op_ext32u_i64:   /* Optional (TCG_TARGET_HAS_ext32u_i64). */
++    int64_t z_mask_old, z_mask;
--    case INDEX_op_ext_i32_i64:
++
--    case INDEX_op_extu_i32_i64:
+     if (arg_is_const(op->args[1])) {
- #endif /* TCG_TARGET_REG_BITS == 64 */
+         uint64_t t;
--    case INDEX_op_neg_i32:      /* Optional (TCG_TARGET_HAS_neg_i32). */
--    case INDEX_op_not_i32:      /* Optional (TCG_TARGET_HAS_not_i32). */
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
--    case INDEX_op_ext8s_i32:    /* Optional (TCG_TARGET_HAS_ext8s_i32). */
+         t = sextract64(t, op->args[2], op->args[3]);
--    case INDEX_op_ext16s_i32:   /* Optional (TCG_TARGET_HAS_ext16s_i32). */
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
--    case INDEX_op_ext8u_i32:    /* Optional (TCG_TARGET_HAS_ext8u_i32). */
+     }
--    case INDEX_op_ext16u_i32:   /* Optional (TCG_TARGET_HAS_ext16u_i32). */
+-    return false;
--    case INDEX_op_bswap16_i32:  /* Optional (TCG_TARGET_HAS_bswap16_i32). */
++
--    case INDEX_op_bswap32_i32:  /* Optional (TCG_TARGET_HAS_bswap32_i32). */
++    z_mask_old = arg_info(op->args[1])->z_mask;
-+
++    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
-+    CASE_32_64(neg)      /* Optional (TCG_TARGET_HAS_neg_*). */
++    if (op->args[2] == 0 && z_mask >= 0) {
-+    CASE_32_64(not)      /* Optional (TCG_TARGET_HAS_not_*). */
++        ctx->a_mask = z_mask_old ^ z_mask;
-+    CASE_32_64(ext8s)    /* Optional (TCG_TARGET_HAS_ext8s_*). */
++    }
-+    CASE_32_64(ext8u)    /* Optional (TCG_TARGET_HAS_ext8u_*). */
++    ctx->z_mask = z_mask;
-+    CASE_32_64(ext16s)   /* Optional (TCG_TARGET_HAS_ext16s_*). */
++
-+    CASE_32_64(ext16u)   /* Optional (TCG_TARGET_HAS_ext16u_*). */
++    return fold_masks(ctx, op);
-+    CASE_64(ext32s)      /* Optional (TCG_TARGET_HAS_ext32s_i64). */
+ }
-+    CASE_64(ext32u)      /* Optional (TCG_TARGET_HAS_ext32u_i64). */
-+    CASE_64(ext_i32)
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
-+    CASE_64(extu_i32)
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-+    CASE_32_64(bswap16)  /* Optional (TCG_TARGET_HAS_bswap16_*). */
+         fold_xi_to_x(ctx, op, 0)) {
-+    CASE_32_64(bswap32)  /* Optional (TCG_TARGET_HAS_bswap32_*). */
+         return true;
-+    CASE_64(bswap64)     /* Optional (TCG_TARGET_HAS_bswap64_i64). */
+     }
-         tcg_out_r(s, args[0]);
++
-         tcg_out_r(s, args[1]);
++    if (arg_is_const(op->args[2])) {
-         break;
++        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
 +                                          arg_info(op->args[1])->z_mask,
 +                                          arg_info(op->args[2])->val);
 +        return fold_masks(ctx, op);
 +    }
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
      return fold_addsub2_i32(ctx, op, false);
  }
 +static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 +{
 +    /* We can't do any folding with a load, but we can record bits. */
 +    switch (op->opc) {
 +    CASE_OP_32_64(ld8u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        break;
 +    CASE_OP_32_64(ld16u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        break;
 +    case INDEX_op_ld32u_i64:
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
 +}
 +
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      }
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
 -        uint64_t z_mask, partmask, affected, tmp;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def;
          bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify using known-zero bits. Currently only ops with a single
 -           output argument is supported. */
 -        z_mask = -1;
 -        affected = -1;
 -        switch (opc) {
 -        CASE_OP_32_64(ext8s):
 -            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext8u):
 -            z_mask = 0xff;
 -            goto and_const;
 -        CASE_OP_32_64(ext16s):
 -            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext16u):
 -            z_mask = 0xffff;
 -            goto and_const;
 -        case INDEX_op_ext32s_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_ext32u_i64:
 -            z_mask = 0xffffffffU;
 -            goto and_const;
 -
 -        CASE_OP_32_64(and):
 -            z_mask = arg_info(op->args[2])->z_mask;
 -            if (arg_is_const(op->args[2])) {
 -        and_const:
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            z_mask = arg_info(op->args[1])->z_mask & z_mask;
 -            break;
 -
 -        case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_extu_i32_i64:
 -            /* We do not compute affected as it is a size changing op.  */
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(andc):
 -            /* Known-zeros does not imply known-ones.  Therefore unless
 -               op->args[2] is constant, we can't infer anything from it.  */
 -            if (arg_is_const(op->args[2])) {
 -                z_mask = ~arg_info(op->args[2])->z_mask;
 -                goto and_const;
 -            }
 -            /* But we certainly know nothing outside args[1] may be set. */
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        case INDEX_op_sar_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_sar_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_shr_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_shr_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_extrl_i64_i32:
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -        case INDEX_op_extrh_i64_i32:
 -            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
 -            break;
 -
 -        CASE_OP_32_64(shl):
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
 -                z_mask = arg_info(op->args[1])->z_mask << tmp;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(neg):
 -            /* Set to 1 all bits to the left of the rightmost.  */
 -            z_mask = -(arg_info(op->args[1])->z_mask
 -                       & -arg_info(op->args[1])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(deposit):
 -            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 -                               op->args[3], op->args[4],
 -                               arg_info(op->args[2])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(extract):
 -            z_mask = extract64(arg_info(op->args[1])->z_mask,
 -                               op->args[2], op->args[3]);
 -            if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -        CASE_OP_32_64(sextract):
 -            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 -                                op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(or):
 -        CASE_OP_32_64(xor):
 -            z_mask = arg_info(op->args[1])->z_mask
 -                   | arg_info(op->args[2])->z_mask;
 -            break;
 -
 -        case INDEX_op_clz_i32:
 -        case INDEX_op_ctz_i32:
 -            z_mask = arg_info(op->args[2])->z_mask | 31;
 -            break;
 -
 -        case INDEX_op_clz_i64:
 -        case INDEX_op_ctz_i64:
 -            z_mask = arg_info(op->args[2])->z_mask | 63;
 -            break;
 -
 -        case INDEX_op_ctpop_i32:
 -            z_mask = 32 | 31;
 -            break;
 -        case INDEX_op_ctpop_i64:
 -            z_mask = 64 | 63;
 -            break;
 -
 -        CASE_OP_32_64(setcond):
 -        case INDEX_op_setcond2_i32:
 -            z_mask = 1;
 -            break;
 -
 -        CASE_OP_32_64(movcond):
 -            z_mask = arg_info(op->args[3])->z_mask
 -                   | arg_info(op->args[4])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(ld8u):
 -            z_mask = 0xff;
 -            break;
 -        CASE_OP_32_64(ld16u):
 -            z_mask = 0xffff;
 -            break;
 -        case INDEX_op_ld32u_i64:
 -            z_mask = 0xffffffffu;
 -            break;
 -
 -        CASE_OP_32_64(qemu_ld):
 -            {
 -                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 -                MemOp mop = get_memop(oi);
 -                if (!(mop & MO_SIGN)) {
 -                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 -                }
 -            }
 -            break;
 -
 -        CASE_OP_32_64(bswap16):
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffff) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap16(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int16_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(16, 48);
 -                break;
 -            }
 -            break;
 -
 -        case INDEX_op_bswap32_i64:
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffffffffu) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap32(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int32_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(32, 32);
 -                break;
 -            }
 -            break;
 -
 -        default:
 -            break;
 -        }
 -
 -        /* 32-bit ops generate 32-bit results.  For the result is zero test
 -           below, we can ignore high bits, but for further optimizations we
 -           need to record that the high bits contain garbage.  */
 -        partmask = z_mask;
 -        if (ctx.type == TCG_TYPE_I32) {
 -            z_mask |= ~(tcg_target_ulong)0xffffffffu;
 -            partmask &= 0xffffffffu;
 -            affected &= 0xffffffffu;
 -        }
 -        ctx.z_mask = z_mask;
 -
 -        if (partmask == 0) {
 -            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -            continue;
 -        }
 -        if (affected == 0) {
 -            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            continue;
 -        }
 +        /* Assume all bits affected, and no bits known zero. */
 +        ctx.a_mask = -1;
 +        ctx.z_mask = -1;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32u_i64:
 +            done = fold_tcg_ld(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 --
 .25.1

-[PATCH 22/27] tcg/tci: Merge mov, not and neg operations
+[PULL 44/56] tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
+Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
+and muls2_i64.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 29 +++++------------------------
+ tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
-file changed, 5 insertions(+), 24 deletions(-)
+file changed, 35 insertions(+), 9 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
-             tci_write_reg(regs, t0, tci_compare64(t1, t2, condition));
+     return false;
  }
 -static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
 +static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  {
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
 -        uint32_t a = arg_info(op->args[2])->val;
 -        uint32_t b = arg_info(op->args[3])->val;
 -        uint64_t r = (uint64_t)a * b;
 +        uint64_t a = arg_info(op->args[2])->val;
 +        uint64_t b = arg_info(op->args[3])->val;
 +        uint64_t h, l;
          TCGArg rl, rh;
 -        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 +        TCGOp *op2;
 +
 +        switch (op->opc) {
 +        case INDEX_op_mulu2_i32:
 +            l = (uint64_t)(uint32_t)a * (uint32_t)b;
 +            h = (int32_t)(l >> 32);
 +            l = (int32_t)l;
 +            break;
 +        case INDEX_op_muls2_i32:
 +            l = (int64_t)(int32_t)a * (int32_t)b;
 +            h = l >> 32;
 +            l = (int32_t)l;
 +            break;
 +        case INDEX_op_mulu2_i64:
 +            mulu64(&l, &h, a, b);
 +            break;
 +        case INDEX_op_muls2_i64:
 +            muls64(&l, &h, a, b);
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
          rl = op->args[0];
          rh = op->args[1];
 -        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
 -        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
 +
 +        /* The proper opcode is supplied by tcg_opt_gen_mov. */
 +        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
 +
 +        tcg_opt_gen_movi(ctx, op, rl, l);
 +        tcg_opt_gen_movi(ctx, op2, rh, h);
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(muluh):
              done = fold_mul_highpart(&ctx, op);
              break;
- #endif
+-        case INDEX_op_mulu2_i32:
--        case INDEX_op_mov_i32:
+-            done = fold_mulu2_i32(&ctx, op);
-+        CASE_32_64(mov)
++        CASE_OP_32_64(muls2):
-             t0 = *tb_ptr++;
++        CASE_OP_32_64(mulu2):
-             t1 = tci_read_r(regs, &tb_ptr);
++            done = fold_multiply2(&ctx, op);
              tci_write_reg(regs, t0, t1);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              tci_write_reg(regs, t0, bswap32(t1));
              break;
- #endif
+         CASE_OP_32_64(nand):
--#if TCG_TARGET_HAS_not_i32
+             done = fold_nand(&ctx, op);
 -        case INDEX_op_not_i32:
 +#if TCG_TARGET_HAS_not_i32 || TCG_TARGET_HAS_not_i64
 +        CASE_32_64(not)
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, ~t1);
              break;
  #endif
 -#if TCG_TARGET_HAS_neg_i32
 -        case INDEX_op_neg_i32:
 +#if TCG_TARGET_HAS_neg_i32 || TCG_TARGET_HAS_neg_i64
 +        CASE_32_64(neg)
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, -t1);
              break;
  #endif
  #if TCG_TARGET_REG_BITS == 64
 -        case INDEX_op_mov_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1);
 -            break;
          case INDEX_op_tci_movi_i64:
              t0 = *tb_ptr++;
              t1 = tci_read_i64(&tb_ptr);
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              tci_write_reg(regs, t0, bswap64(t1));
              break;
  #endif
 -#if TCG_TARGET_HAS_not_i64
 -        case INDEX_op_not_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, ~t1);
 -            break;
 -#endif
 -#if TCG_TARGET_HAS_neg_i64
 -        case INDEX_op_neg_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, -t1);
 -            break;
 -#endif
  #endif /* TCG_TARGET_REG_BITS == 64 */
              /* QEMU specific operations. */
 --
 .25.1

-[PATCH 13/27] tcg/tci: Remove tci_read_r8s
+[PULL 45/56] tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
-Use explicit casts for ext8s opcodes.
+Rename to fold_addsub2.
 Use Int128 to implement the wider operation.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 25 ++++---------------------
+ tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
-file changed, 4 insertions(+), 21 deletions(-)
+file changed, 44 insertions(+), 21 deletions(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index)
+@@ -XXX,XX +XXX,XX @@
-     return regs[index];
+  */
  #include "qemu/osdep.h"
 +#include "qemu/int128.h"
  #include "tcg/tcg-op.h"
  #include "tcg-internal.h"
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
      return false;
  }
--#if TCG_TARGET_HAS_ext8s_i32 || TCG_TARGET_HAS_ext8s_i64
+-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
--static int8_t tci_read_reg8s(const tcg_target_ulong *regs, TCGReg index)
++static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 -{
 -    return (int8_t)tci_read_reg(regs, index);
 -}
 -#endif
 -
  #if TCG_TARGET_HAS_ext16s_i32 || TCG_TARGET_HAS_ext16s_i64
  static int16_t tci_read_reg16s(const tcg_target_ulong *regs, TCGReg index)
  {
-@@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
-     return value;
+         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
 -        uint32_t al = arg_info(op->args[2])->val;
 -        uint32_t ah = arg_info(op->args[3])->val;
 -        uint32_t bl = arg_info(op->args[4])->val;
 -        uint32_t bh = arg_info(op->args[5])->val;
 -        uint64_t a = ((uint64_t)ah << 32) | al;
 -        uint64_t b = ((uint64_t)bh << 32) | bl;
 +        uint64_t al = arg_info(op->args[2])->val;
 +        uint64_t ah = arg_info(op->args[3])->val;
 +        uint64_t bl = arg_info(op->args[4])->val;
 +        uint64_t bh = arg_info(op->args[5])->val;
          TCGArg rl, rh;
 -        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 +        TCGOp *op2;
 -        if (add) {
 -            a += b;
 +        if (ctx->type == TCG_TYPE_I32) {
 +            uint64_t a = deposit64(al, 32, 32, ah);
 +            uint64_t b = deposit64(bl, 32, 32, bh);
 +
 +            if (add) {
 +                a += b;
 +            } else {
 +                a -= b;
 +            }
 +
 +            al = sextract64(a, 0, 32);
 +            ah = sextract64(a, 32, 32);
          } else {
 -            a -= b;
 +            Int128 a = int128_make128(al, ah);
 +            Int128 b = int128_make128(bl, bh);
 +
 +            if (add) {
 +                a = int128_add(a, b);
 +            } else {
 +                a = int128_sub(a, b);
 +            }
 +
 +            al = int128_getlo(a);
 +            ah = int128_gethi(a);
          }
          rl = op->args[0];
          rh = op->args[1];
 -        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
 -        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
 +
 +        /* The proper opcode is supplied by tcg_opt_gen_mov. */
 +        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
 +
 +        tcg_opt_gen_movi(ctx, op, rl, al);
 +        tcg_opt_gen_movi(ctx, op2, rh, ah);
          return true;
      }
      return false;
  }
--#if TCG_TARGET_HAS_ext8s_i32 || TCG_TARGET_HAS_ext8s_i64
+-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
--/* Read indexed register (8 bit signed) from bytecode. */
++static bool fold_add2(OptContext *ctx, TCGOp *op)
--static int8_t tci_read_r8s(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
+ {
--{
+-    return fold_addsub2_i32(ctx, op, true);
--    int8_t value = tci_read_reg8s(regs, **tb_ptr);
++    return fold_addsub2(ctx, op, true);
--    *tb_ptr += 1;
+ }
--    return value;
--}
+ static bool fold_and(OptContext *ctx, TCGOp *op)
--#endif
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
--
+     return false;
- /* Read indexed register (16 bit) from bytecode. */
+ }
- static uint16_t tci_read_r16(const tcg_target_ulong *regs,
-                              const uint8_t **tb_ptr)
+-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++static bool fold_sub2(OptContext *ctx, TCGOp *op)
- #if TCG_TARGET_HAS_ext8s_i32
+ {
-         case INDEX_op_ext8s_i32:
+-    return fold_addsub2_i32(ctx, op, false);
-             t0 = *tb_ptr++;
++    return fold_addsub2(ctx, op, false);
--            t1 = tci_read_r8s(regs, &tb_ptr);
+ }
--            tci_write_reg(regs, t0, t1);
-+            t1 = tci_read_r(regs, &tb_ptr);
+ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
-+            tci_write_reg(regs, t0, (int8_t)t1);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(add):
              done = fold_add(&ctx, op);
              break;
- #endif
+-        case INDEX_op_add2_i32:
- #if TCG_TARGET_HAS_ext16s_i32
+-            done = fold_add2_i32(&ctx, op);
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
++        CASE_OP_32_64(add2):
- #if TCG_TARGET_HAS_ext8s_i64
++            done = fold_add2(&ctx, op);
          case INDEX_op_ext8s_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r8s(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (int8_t)t1);
              break;
- #endif
+         CASE_OP_32_64_VEC(and):
- #if TCG_TARGET_HAS_ext16s_i64
+             done = fold_and(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
              done = fold_sub(&ctx, op);
              break;
 -        case INDEX_op_sub2_i32:
 -            done = fold_sub2_i32(&ctx, op);
 +        CASE_OP_32_64(sub2):
 +            done = fold_sub2(&ctx, op);
              break;
          CASE_OP_32_64_VEC(xor):
              done = fold_xor(&ctx, op);
 --
 .25.1

-[PATCH 10/27] tcg/tci: Merge identical cases in generation (conditional opcodes)
+[PULL 46/56] tcg/optimize: Sink commutative operand swapping into fold functions
-Use CASE_32_64 and CASE_64 to reduce ifdefs and merge
+Most of these are handled by creating a fold_const2_commutative
-cases that are identical between 32-bit and 64-bit hosts.
+to handle all of the binary operators.  The rest were already
+handled on a case-by-case basis in the switch, and have their
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+own fold function in which to place the call.
-Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
-Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org>
+We now have only one major switch on TCGOpcode.
-[PMD: Split patch as 4/5]
-Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Introduce NO_DEST and a block comment for swap_commutative in
-Message-Id: <20210218232840.1760806-5-f4bug@amsat.org>
+order to make the handling of brcond and movcond opcodes cleaner.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target.c.inc | 23 ++++++-----------------
+ tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
-file changed, 6 insertions(+), 17 deletions(-)
+file changed, 70 insertions(+), 72 deletions(-)
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
      return -1;
  }
 +/**
 + * swap_commutative:
 + * @dest: TCGArg of the destination argument, or NO_DEST.
 + * @p1: first paired argument
 + * @p2: second paired argument
 + *
 + * If *@p1 is a constant and *@p2 is not, swap.
 + * If *@p2 matches @dest, swap.
 + * Return true if a swap was performed.
 + */
 +
 +#define NO_DEST  temp_arg(NULL)
 +
  static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
  {
      TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 +{
 +    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
  static bool fold_add(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
  static bool fold_add2(OptContext *ctx, TCGOp *op)
  {
 +    /* Note that the high and low parts may be independently swapped. */
 +    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 +    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 +
      return fold_addsub2(ctx, op, true);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      uint64_t z1, z2;
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
 +        op->args[2] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
  static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[4];
 -    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      TCGArg label = op->args[5];
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[0], &op->args[2])) {
 +        op->args[4] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      if (i >= 0) {
          goto do_brcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +    /*
 +     * Canonicalize the "false" input reg to match the destination reg so
 +     * that the tcg backend can implement a "move if true" operation.
 +     */
 +    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 +        op->args[5] = cond = tcg_invert_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  {
 +    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 +
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
          uint64_t a = arg_info(op->args[2])->val;
          uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 +        op->args[3] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
  static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[1], &op->args[3])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
      if (i >= 0) {
          goto do_setcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
-         set_jmp_reset_offset(s, args[0]);
-         break;
+-        /* For commutative operations make constant second argument */
-+
+-        switch (opc) {
-     case INDEX_op_br:
+-        CASE_OP_32_64_VEC(add):
-         tci_out_label(s, arg_label(args[0]));
+-        CASE_OP_32_64_VEC(mul):
-         break;
+-        CASE_OP_32_64_VEC(and):
--    case INDEX_op_setcond_i32:
+-        CASE_OP_32_64_VEC(or):
-+
+-        CASE_OP_32_64_VEC(xor):
-+    CASE_32_64(setcond)
+-        CASE_OP_32_64(eqv):
-         tcg_out_r(s, args[0]);
+-        CASE_OP_32_64(nand):
-         tcg_out_r(s, args[1]);
+-        CASE_OP_32_64(nor):
-         tcg_out_r(s, args[2]);
+-        CASE_OP_32_64(muluh):
-         tcg_out8(s, args[3]);   /* condition */
+-        CASE_OP_32_64(mulsh):
-         break;
+-            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
-+
+-            break;
- #if TCG_TARGET_REG_BITS == 32
+-        CASE_OP_32_64(brcond):
-     case INDEX_op_setcond2_i32:
+-            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
-         /* setcond2_i32 cond, t0, t1_low, t1_high, t2_low, t2_high */
+-                op->args[2] = tcg_swap_cond(op->args[2]);
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+-            }
-         tcg_out_r(s, args[4]);
+-            break;
-         tcg_out8(s, args[5]);   /* condition */
+-        CASE_OP_32_64(setcond):
-         break;
+-            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
--#elif TCG_TARGET_REG_BITS == 64
+-                op->args[3] = tcg_swap_cond(op->args[3]);
--    case INDEX_op_setcond_i64:
+-            }
--        tcg_out_r(s, args[0]);
+-            break;
--        tcg_out_r(s, args[1]);
+-        CASE_OP_32_64(movcond):
--        tcg_out_r(s, args[2]);
+-            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
--        tcg_out8(s, args[3]);   /* condition */
+-                op->args[5] = tcg_swap_cond(op->args[5]);
--        break;
+-            }
- #endif
+-            /* For movcond, we canonicalize the "false" input reg to match
-     case INDEX_op_ld8u_i32:
+-               the destination reg so that the tcg backend can implement
-     case INDEX_op_ld8s_i32:
+-               a "move if true" operation.  */
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+-            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-         tcg_out8(s, args[4]);
+-                op->args[5] = tcg_invert_cond(op->args[5]);
-         break;
+-            }
+-            break;
--#if TCG_TARGET_REG_BITS == 64
+-        CASE_OP_32_64(add2):
--    case INDEX_op_brcond_i64:
+-            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
-+    CASE_32_64(brcond)
+-            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
-         tcg_out_r(s, args[0]);
+-            break;
-         tcg_out_r(s, args[1]);
+-        CASE_OP_32_64(mulu2):
-         tcg_out8(s, args[2]);           /* condition */
+-        CASE_OP_32_64(muls2):
-         tci_out_label(s, arg_label(args[3]));
+-            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
-         break;
+-            break;
--#endif /* TCG_TARGET_REG_BITS == 64 */
+-        case INDEX_op_brcond2_i32:
+-            if (swap_commutative2(&op->args[0], &op->args[2])) {
-     CASE_32_64(neg)      /* Optional (TCG_TARGET_HAS_neg_*). */
+-                op->args[4] = tcg_swap_cond(op->args[4]);
-     CASE_32_64(not)      /* Optional (TCG_TARGET_HAS_not_*). */
+-            }
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+-            break;
-         tcg_out_r(s, args[3]);
+-        case INDEX_op_setcond2_i32:
-         break;
+-            if (swap_commutative2(&op->args[1], &op->args[3])) {
- #endif
+-                op->args[5] = tcg_swap_cond(op->args[5]);
--    case INDEX_op_brcond_i32:
+-            }
--        tcg_out_r(s, args[0]);
+-            break;
--        tcg_out_r(s, args[1]);
+-        default:
--        tcg_out8(s, args[2]);           /* condition */
+-            break;
--        tci_out_label(s, arg_label(args[3]));
+-        }
--        break;
+-
-+
+         /* Assume all bits affected, and no bits known zero. */
-     case INDEX_op_qemu_ld_i32:
+         ctx.a_mask = -1;
-         tcg_out_r(s, *args++);
+         ctx.z_mask = -1;
          tcg_out_r(s, *args++);
 --
 .25.1

-[PATCH 11/27] tcg/tci: Merge identical cases in generation (load/store opcodes)
+[PULL 47/56] tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
-Use CASE_32_64 and CASE_64 to reduce ifdefs and merge
+This "garbage" setting pre-dates the addition of the type
-cases that are identical between 32-bit and 64-bit hosts.
+changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
 and INDEX_op_extr{l,h}_i64_i32.
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+So now we have definitive points at which to adjust z_mask
-Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+to eliminate such bits from the 32-bit operands.
-Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org>
-[PMD: Split patch as 5/5]
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Message-Id: <20210218232840.1760806-6-f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target.c.inc | 49 ++++++++++++----------------------------
+ tcg/optimize.c | 35 ++++++++++++++++-------------------
-file changed, 14 insertions(+), 35 deletions(-)
+file changed, 16 insertions(+), 19 deletions(-)
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
-         tcg_out8(s, args[5]);   /* condition */
+         ti->is_const = true;
-         break;
+         ti->val = ts->val;
- #endif
+         ti->z_mask = ts->val;
--    case INDEX_op_ld8u_i32:
+-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
--    case INDEX_op_ld8s_i32:
+-            /* High bits of a 32-bit quantity are garbage.  */
--    case INDEX_op_ld16u_i32:
+-            ti->z_mask |= ~0xffffffffull;
--    case INDEX_op_ld16s_i32:
+-        }
      } else {
          ti->is_const = false;
          ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      TCGTemp *src_ts = arg_temp(src);
      TempOptInfo *di;
      TempOptInfo *si;
 -    uint64_t z_mask;
      TCGOpcode new_op;
      if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      op->args[0] = dst;
      op->args[1] = src;
 -    z_mask = si->z_mask;
 -    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
 -        /* High bits of the destination are now garbage.  */
 -        z_mask |= ~0xffffffffull;
 -    }
 -    di->z_mask = z_mask;
 +    di->z_mask = si->z_mask;
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
 -    /* Convert movi to mov with constant temp. */
 -    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
 +    TCGTemp *tv;
 +    if (ctx->type == TCG_TYPE_I32) {
 +        val = (int32_t)val;
 +    }
 +
-+    CASE_32_64(ld8u)
++    /* Convert movi to mov with constant temp. */
-+    CASE_32_64(ld8s)
++    tv = tcg_constant_internal(ctx->type, val);
-+    CASE_32_64(ld16u)
+     init_ts_info(ctx, tv);
-+    CASE_32_64(ld16s)
+     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
-     case INDEX_op_ld_i32:
+ }
--    case INDEX_op_st8_i32:
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
--    case INDEX_op_st16_i32:
+     uint64_t z_mask = ctx->z_mask;
-+    CASE_64(ld32u)
-+    CASE_64(ld32s)
+     /*
-+    CASE_64(ld)
+-     * 32-bit ops generate 32-bit results.  For the result is zero test
-+    CASE_32_64(st8)
+-     * below, we can ignore high bits, but for further optimizations we
-+    CASE_32_64(st16)
+-     * need to record that the high bits contain garbage.
-     case INDEX_op_st_i32:
++     * 32-bit ops generate 32-bit results, which for the purpose of
--    case INDEX_op_ld8u_i64:
++     * simplifying tcg are sign-extended.  Certainly that's how we
--    case INDEX_op_ld8s_i64:
++     * represent our constants elsewhere.  Note that the bits will
--    case INDEX_op_ld16u_i64:
++     * be reset properly for a 64-bit value when encountering the
--    case INDEX_op_ld16s_i64:
++     * type changing opcodes.
--    case INDEX_op_ld32u_i64:
+      */
--    case INDEX_op_ld32s_i64:
+     if (ctx->type == TCG_TYPE_I32) {
--    case INDEX_op_ld_i64:
+-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
--    case INDEX_op_st8_i64:
+-        a_mask &= MAKE_64BIT_MASK(0, 32);
--    case INDEX_op_st16_i64:
+-        z_mask &= MAKE_64BIT_MASK(0, 32);
--    case INDEX_op_st32_i64:
++        a_mask = (int32_t)a_mask;
--    case INDEX_op_st_i64:
++        z_mask = (int32_t)z_mask;
-+    CASE_64(st32)
++        ctx->z_mask = z_mask;
-+    CASE_64(st)
+     }
-         stack_bounds_check(args[1], args[2]);
-         tcg_out_r(s, args[0]);
+     if (z_mask == 0) {
          tcg_out_r(s, args[1]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
  #endif
      case INDEX_op_qemu_ld_i32:
 -        tcg_out_r(s, *args++);
 -        tcg_out_r(s, *args++);
 -        if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
 -            tcg_out_r(s, *args++);
 -        }
 -        tcg_out_i(s, *args++);
 -        break;
 -    case INDEX_op_qemu_ld_i64:
 -        tcg_out_r(s, *args++);
 -        if (TCG_TARGET_REG_BITS == 32) {
 -            tcg_out_r(s, *args++);
 -        }
 -        tcg_out_r(s, *args++);
 -        if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
 -            tcg_out_r(s, *args++);
 -        }
 -        tcg_out_i(s, *args++);
 -        break;
      case INDEX_op_qemu_st_i32:
          tcg_out_r(s, *args++);
          tcg_out_r(s, *args++);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          }
          tcg_out_i(s, *args++);
          break;
 +
 +    case INDEX_op_qemu_ld_i64:
      case INDEX_op_qemu_st_i64:
          tcg_out_r(s, *args++);
          if (TCG_TARGET_REG_BITS == 32) {
 --
 .25.1

-[PATCH 21/27] tcg/tci: Merge bswap operations
+[PULL 48/56] tcg/optimize: Use fold_xx_to_i for orc
-This includes bswap16 and bswap32.
+Recognize the constant function for or-complement.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 22 ++++------------------
+ tcg/optimize.c | 1 +
-file changed, 4 insertions(+), 18 deletions(-)
+file changed, 1 insertion(+)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
-             tci_write_reg(regs, t0, (uint16_t)t1);
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
-             break;
+ {
- #endif
+     if (fold_const2(ctx, op) ||
--#if TCG_TARGET_HAS_bswap16_i32
++        fold_xx_to_i(ctx, op, -1) ||
--        case INDEX_op_bswap16_i32:
+         fold_xi_to_x(ctx, op, -1) ||
-+#if TCG_TARGET_HAS_bswap16_i32 || TCG_TARGET_HAS_bswap16_i64
+         fold_ix_to_not(ctx, op, 0)) {
-+        CASE_32_64(bswap16)
+         return true;
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, bswap16(t1));
              break;
  #endif
 -#if TCG_TARGET_HAS_bswap32_i32
 -        case INDEX_op_bswap32_i32:
 +#if TCG_TARGET_HAS_bswap32_i32 || TCG_TARGET_HAS_bswap32_i64
 +        CASE_32_64(bswap32)
              t0 = *tb_ptr++;
              t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, bswap32(t1));
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
              t1 = tci_read_r(regs, &tb_ptr);
              tci_write_reg(regs, t0, (uint32_t)t1);
              break;
 -#if TCG_TARGET_HAS_bswap16_i64
 -        case INDEX_op_bswap16_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, bswap16(t1));
 -            break;
 -#endif
 -#if TCG_TARGET_HAS_bswap32_i64
 -        case INDEX_op_bswap32_i64:
 -            t0 = *tb_ptr++;
 -            t1 = tci_read_r(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, bswap32(t1));
 -            break;
 -#endif
  #if TCG_TARGET_HAS_bswap64_i64
          case INDEX_op_bswap64_i64:
              t0 = *tb_ptr++;
 --
 .25.1

-[PATCH 09/27] tcg/tci: Merge identical cases in generation (deposit opcode)
+[PULL 49/56] tcg/optimize: Use fold_xi_to_x for mul
-Use CASE_32_64 and CASE_64 to reduce ifdefs and merge
+Recognize the identity function for low-part multiply.
 cases that are identical between 32-bit and 64-bit hosts.
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
-Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org>
-[PMD: Split patch as 3/5]
-Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
-Message-Id: <20210218232840.1760806-4-f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target.c.inc | 12 ++----------
+ tcg/optimize.c | 3 ++-
-file changed, 2 insertions(+), 10 deletions(-)
+file changed, 2 insertions(+), 1 deletion(-)
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
-         tcg_out_r(s, args[1]);
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
-         tcg_out_r(s, args[2]);
+ {
-         break;
+     if (fold_const2(ctx, op) ||
--    case INDEX_op_deposit_i32:  /* Optional (TCG_TARGET_HAS_deposit_i32). */
+-        fold_xi_to_i(ctx, op, 0)) {
-+
++        fold_xi_to_i(ctx, op, 0) ||
-+    CASE_32_64(deposit)  /* Optional (TCG_TARGET_HAS_deposit_*). */
++        fold_xi_to_x(ctx, op, 1)) {
-         tcg_out_r(s, args[0]);
+         return true;
-         tcg_out_r(s, args[1]);
+     }
-         tcg_out_r(s, args[2]);
+     return false;
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          break;
  #if TCG_TARGET_REG_BITS == 64
 -    case INDEX_op_deposit_i64:  /* Optional (TCG_TARGET_HAS_deposit_i64). */
 -        tcg_out_r(s, args[0]);
 -        tcg_out_r(s, args[1]);
 -        tcg_out_r(s, args[2]);
 -        tcg_debug_assert(args[3] <= UINT8_MAX);
 -        tcg_out8(s, args[3]);
 -        tcg_debug_assert(args[4] <= UINT8_MAX);
 -        tcg_out8(s, args[4]);
 -        break;
      case INDEX_op_brcond_i64:
          tcg_out_r(s, args[0]);
          tcg_out_r(s, args[1]);
 --
 .25.1

-[PATCH 04/27] tcg/tci: Use exec/cpu_ldst.h interfaces
+[PULL 50/56] tcg/optimize: Use fold_xi_to_x for div
-Use the provided cpu_ldst.h interfaces.  This fixes the build vs
+Recognize the identity function for division.
 the unconverted uses of g2h(), adds missed memory trace events,
 and correctly recognizes when a SIGSEGV belongs to the guest via
 set_helper_retaddr().
-Fixes: 3e8f1628e864
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
-Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 73 +++++++++++++++++++++----------------------------------
+ tcg/optimize.c | 6 +++++-
-file changed, 28 insertions(+), 45 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
-     return result;
  static bool fold_divide(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 1)) {
 +        return true;
 +    }
 +    return false;
  }
--#ifdef CONFIG_SOFTMMU
+ static bool fold_dup(OptContext *ctx, TCGOp *op)
 -# define qemu_ld_ub \
 -    helper_ret_ldub_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 -# define qemu_ld_leuw \
 -    helper_le_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 -# define qemu_ld_leul \
 -    helper_le_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 -# define qemu_ld_leq \
 -    helper_le_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 -# define qemu_ld_beuw \
 -    helper_be_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 -# define qemu_ld_beul \
 -    helper_be_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 -# define qemu_ld_beq \
 -    helper_be_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
 -# define qemu_st_b(X) \
 -    helper_ret_stb_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 -# define qemu_st_lew(X) \
 -    helper_le_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 -# define qemu_st_lel(X) \
 -    helper_le_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 -# define qemu_st_leq(X) \
 -    helper_le_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 -# define qemu_st_bew(X) \
 -    helper_be_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 -# define qemu_st_bel(X) \
 -    helper_be_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 -# define qemu_st_beq(X) \
 -    helper_be_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
 -#else
 -# define qemu_ld_ub      ldub_p(g2h(taddr))
 -# define qemu_ld_leuw    lduw_le_p(g2h(taddr))
 -# define qemu_ld_leul    (uint32_t)ldl_le_p(g2h(taddr))
 -# define qemu_ld_leq     ldq_le_p(g2h(taddr))
 -# define qemu_ld_beuw    lduw_be_p(g2h(taddr))
 -# define qemu_ld_beul    (uint32_t)ldl_be_p(g2h(taddr))
 -# define qemu_ld_beq     ldq_be_p(g2h(taddr))
 -# define qemu_st_b(X)    stb_p(g2h(taddr), X)
 -# define qemu_st_lew(X)  stw_le_p(g2h(taddr), X)
 -# define qemu_st_lel(X)  stl_le_p(g2h(taddr), X)
 -# define qemu_st_leq(X)  stq_le_p(g2h(taddr), X)
 -# define qemu_st_bew(X)  stw_be_p(g2h(taddr), X)
 -# define qemu_st_bel(X)  stl_be_p(g2h(taddr), X)
 -# define qemu_st_beq(X)  stq_be_p(g2h(taddr), X)
 -#endif
 +#define qemu_ld_ub \
 +    cpu_ldub_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_ld_leuw \
 +    cpu_lduw_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_ld_leul \
 +    cpu_ldl_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_ld_leq \
 +    cpu_ldq_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_ld_beuw \
 +    cpu_lduw_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_ld_beul \
 +    cpu_ldl_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_ld_beq \
 +    cpu_ldq_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_st_b(X) \
 +    cpu_stb_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_st_lew(X) \
 +    cpu_stw_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_st_lel(X) \
 +    cpu_stl_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_st_leq(X) \
 +    cpu_stq_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_st_bew(X) \
 +    cpu_stw_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_st_bel(X) \
 +    cpu_stl_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
 +#define qemu_st_beq(X) \
 +    cpu_stq_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
  #if TCG_TARGET_REG_BITS == 64
  # define CASE_32_64(x) \
 --
 .25.1

-[PATCH 17/27] tcg/tci: Remove tci_read_r32s
+[PULL 51/56] tcg/optimize: Use fold_xx_to_i for rem
-Use explicit casts for ext32s opcodes.
+Recognize the constant function for remainder.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 20 ++------------------
+ tcg/optimize.c | 6 +++++-
-file changed, 2 insertions(+), 18 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index)
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
-     return regs[index];
  static bool fold_remainder(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
--#if TCG_TARGET_REG_BITS == 64
+ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 -static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index)
 -{
 -    return (int32_t)tci_read_reg(regs, index);
 -}
 -#endif
 -
  #if TCG_TARGET_REG_BITS == 64
  static uint64_t tci_read_reg64(const tcg_target_ulong *regs, TCGReg index)
  {
@@ -XXX,XX +XXX,XX @@ static uint64_t tci_read_r64(const tcg_target_ulong *regs,
      return tci_uint64(tci_read_r(regs, tb_ptr), low);
  }
  #elif TCG_TARGET_REG_BITS == 64
 -/* Read indexed register (32 bit signed) from bytecode. */
 -static int32_t tci_read_r32s(const tcg_target_ulong *regs,
 -                             const uint8_t **tb_ptr)
 -{
 -    int32_t value = tci_read_reg32s(regs, **tb_ptr);
 -    *tb_ptr += 1;
 -    return value;
 -}
 -
  /* Read indexed register (64 bit) from bytecode. */
  static uint64_t tci_read_r64(const tcg_target_ulong *regs,
                               const uint8_t **tb_ptr)
@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
  #endif
          case INDEX_op_ext_i32_i64:
              t0 = *tb_ptr++;
 -            t1 = tci_read_r32s(regs, &tb_ptr);
 -            tci_write_reg(regs, t0, t1);
 +            t1 = tci_read_r(regs, &tb_ptr);
 +            tci_write_reg(regs, t0, (int32_t)t1);
              break;
  #if TCG_TARGET_HAS_ext32u_i64
          case INDEX_op_ext32u_i64:
 --
 .25.1

-[PATCH 02/27] tcg/aarch64: Fix I3617_CMLE0
+[PULL 52/56] tcg/optimize: Optimize sign extensions
-Fix a typo in the encoding of the cmle (zero) instruction.
+Certain targets, like riscv, produce signed 32-bit results.
+This can lead to lots of redundant extensions as values are
-Fixes: 14e4c1e2355 ("tcg/aarch64: Add vector operations")
+manipulated.
 Begin by tracking only the obvious sign-extensions, and
 converting them to simple copies when possible.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/aarch64/tcg-target.c.inc | 2 +-
+ tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
-file changed, 1 insertion(+), 1 deletion(-)
+file changed, 102 insertions(+), 21 deletions(-)
-diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/aarch64/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/aarch64/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ typedef enum {
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-     I3617_CMEQ0     = 0x0e209800,
+     TCGTemp *next_copy;
-     I3617_CMLT0     = 0x0e20a800,
+     uint64_t val;
-     I3617_CMGE0     = 0x2e208800,
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
--    I3617_CMLE0     = 0x2e20a800,
++    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
-+    I3617_CMLE0     = 0x2e209800,
+ } TempOptInfo;
-     I3617_NOT       = 0x2e205800,
-     I3617_ABS       = 0x0e20b800,
+ typedef struct OptContext {
-     I3617_NEG       = 0x2e20b800,
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      /* In flight values from optimization. */
      uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
      uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
 +    uint64_t s_mask;  /* mask of clrsb(value) bits */
      TCGType type;
  } OptContext;
 +/* Calculate the smask for a specific value. */
 +static uint64_t smask_from_value(uint64_t value)
 +{
 +    int rep = clrsb64(value);
 +    return ~(~0ull >> rep);
 +}
 +
 +/*
 + * Calculate the smask for a given set of known-zeros.
 + * If there are lots of zeros on the left, we can consider the remainder
 + * an unsigned field, and thus the corresponding signed field is one bit
 + * larger.
 + */
 +static uint64_t smask_from_zmask(uint64_t zmask)
 +{
 +    /*
 +     * Only the 0 bits are significant for zmask, thus the msb itself
 +     * must be zero, else we have no sign information.
 +     */
 +    int rep = clz64(zmask);
 +    if (rep == 0) {
 +        return 0;
 +    }
 +    rep -= 1;
 +    return ~(~0ull >> rep);
 +}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
      ti->prev_copy = ts;
      ti->is_const = false;
      ti->z_mask = -1;
 +    ti->s_mask = 0;
  }
  static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
          ti->is_const = true;
          ti->val = ts->val;
          ti->z_mask = ts->val;
 +        ti->s_mask = smask_from_value(ts->val);
      } else {
          ti->is_const = false;
          ti->z_mask = -1;
 +        ti->s_mask = 0;
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      op->args[1] = src;
      di->z_mask = si->z_mask;
 +    di->s_mask = si->s_mask;
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      nb_oargs = def->nb_oargs;
      for (i = 0; i < nb_oargs; i++) {
 -        reset_temp(op->args[i]);
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        reset_ts(ts);
          /*
 -         * Save the corresponding known-zero bits mask for the
 +         * Save the corresponding known-zero/sign bits mask for the
           * first output argument (only one supported so far).
           */
          if (i == 0) {
 -            arg_info(op->args[i])->z_mask = ctx->z_mask;
 +            ts_info(ts)->z_mask = ctx->z_mask;
 +            ts_info(ts)->s_mask = ctx->s_mask;
          }
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
      uint64_t z_mask = ctx->z_mask;
 +    uint64_t s_mask = ctx->s_mask;
      /*
       * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      if (ctx->type == TCG_TYPE_I32) {
          a_mask = (int32_t)a_mask;
          z_mask = (int32_t)z_mask;
 +        s_mask |= MAKE_64BIT_MASK(32, 32);
          ctx->z_mask = z_mask;
 +        ctx->s_mask = s_mask;
      }
      if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask, sign;
 +    uint64_t z_mask, s_mask, sign;
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      }
      z_mask = arg_info(op->args[1])->z_mask;
 +
      switch (op->opc) {
      case INDEX_op_bswap16_i32:
      case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      default:
          g_assert_not_reached();
      }
 +    s_mask = smask_from_zmask(z_mask);
      switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
      case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
          /* If the sign bit may be 1, force all the bits above to 1. */
          if (z_mask & sign) {
              z_mask |= sign;
 +            s_mask = sign << 1;
          }
          break;
      default:
          /* The high bits are undefined: force all bits above the sign to 1. */
          z_mask |= sign << 1;
 +        s_mask = 0;
          break;
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = s_mask;
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
      uint64_t z_mask_old, z_mask;
 +    int pos = op->args[2];
 +    int len = op->args[3];
      if (arg_is_const(op->args[1])) {
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = extract64(t, op->args[2], op->args[3]);
 +        t = extract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      z_mask_old = arg_info(op->args[1])->z_mask;
 -    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 -    if (op->args[2] == 0) {
 +    z_mask = extract64(z_mask_old, pos, len);
 +    if (pos == 0) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = smask_from_zmask(z_mask);
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask_old, z_mask, sign;
 +    uint64_t s_mask_old, s_mask, z_mask, sign;
      bool type_change = false;
      if (fold_const1(ctx, op)) {
          return true;
      }
 -    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    s_mask = arg_info(op->args[1])->s_mask;
 +    s_mask_old = s_mask;
      switch (op->opc) {
      CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
      if (z_mask & sign) {
          z_mask |= sign;
 -    } else if (!type_change) {
 -        ctx->a_mask = z_mask_old ^ z_mask;
      }
 +    s_mask |= sign << 1;
 +
      ctx->z_mask = z_mask;
 +    ctx->s_mask = s_mask;
 +    if (!type_change) {
 +        ctx->a_mask = s_mask & ~s_mask_old;
 +    }
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = smask_from_zmask(z_mask);
      if (!type_change) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
      MemOp mop = get_memop(oi);
      int width = 8 * memop_size(mop);
 -    if (!(mop & MO_SIGN) && width < 64) {
 -        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    if (width < 64) {
 +        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
 +        if (!(mop & MO_SIGN)) {
 +            ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +            ctx->s_mask <<= 1;
 +        }
      }
      /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 -    int64_t z_mask_old, z_mask;
 +    uint64_t z_mask, s_mask, s_mask_old;
 +    int pos = op->args[2];
 +    int len = op->args[3];
      if (arg_is_const(op->args[1])) {
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = sextract64(t, op->args[2], op->args[3]);
 +        t = sextract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    z_mask_old = arg_info(op->args[1])->z_mask;
 -    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 -    if (op->args[2] == 0 && z_mask >= 0) {
 -        ctx->a_mask = z_mask_old ^ z_mask;
 -    }
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask, pos, len);
      ctx->z_mask = z_mask;
 +    s_mask_old = arg_info(op->args[1])->s_mask;
 +    s_mask = sextract64(s_mask_old, pos, len);
 +    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
 +    ctx->s_mask = s_mask;
 +
 +    if (pos == 0) {
 +        ctx->a_mask = s_mask & ~s_mask_old;
 +    }
 +
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  {
      /* We can't do any folding with a load, but we can record bits. */
      switch (op->opc) {
 +    CASE_OP_32_64(ld8s):
 +        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
 +        break;
      CASE_OP_32_64(ld8u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
 +        break;
 +    CASE_OP_32_64(ld16s):
 +        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
          break;
      CASE_OP_32_64(ld16u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
 +        break;
 +    case INDEX_op_ld32s_i64:
 +        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
          break;
      case INDEX_op_ld32u_i64:
          ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
          break;
      default:
          g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* Assume all bits affected, and no bits known zero. */
 +        /* Assume all bits affected, no bits known zero, no sign reps. */
          ctx.a_mask = -1;
          ctx.z_mask = -1;
 +        ctx.s_mask = 0;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8s):
          CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16s):
          CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32s_i64:
          case INDEX_op_ld32u_i64:
              done = fold_tcg_ld(&ctx, op);
              break;
 --
 .25.1

-[PATCH 12/27] tcg/tci: Remove tci_read_r8
+[PULL 53/56] tcg/optimize: Propagate sign info for logical operations
-Use explicit casts for ext8u opcodes, and allow truncation
+Sign repetitions are perforce all identical, whether they are 1 or 0.
-to happen with the store for st8 opcodes.
+Bitwise operations preserve the relative quantity of the repetitions.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci.c | 23 +++++------------------
+ tcg/optimize.c | 29 +++++++++++++++++++++++++++++
-file changed, 5 insertions(+), 18 deletions(-)
+file changed, 29 insertions(+)
-diff --git a/tcg/tci.c b/tcg/tci.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci.c
+--- a/tcg/optimize.c
-+++ b/tcg/tci.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static int32_t tci_read_reg32s(const tcg_target_ulong *regs, TCGReg index)
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
      z2 = arg_info(op->args[2])->z_mask;
      ctx->z_mask = z1 & z2;
 +    /*
 +     * Sign repetitions are perforce all identical, whether they are 1 or 0.
 +     * Bitwise operations preserve the relative quantity of the repetitions.
 +     */
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
 +
      /*
       * Known-zeros does not imply known-ones.  Therefore unless
       * arg2 is constant, we can't infer affected bits from it.
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
      }
      ctx->z_mask = z1;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
- #endif
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
--static uint8_t tci_read_reg8(const tcg_target_ulong *regs, TCGReg index)
+         fold_xi_to_not(ctx, op, 0)) {
--{
+         return true;
--    return (uint8_t)tci_read_reg(regs, index);
+     }
--}
++
--
++    ctx->s_mask = arg_info(op->args[1])->s_mask
- static uint16_t tci_read_reg16(const tcg_target_ulong *regs, TCGReg index)
++                & arg_info(op->args[2])->s_mask;
- {
+     return false;
      return (uint16_t)tci_read_reg(regs, index);
@@ -XXX,XX +XXX,XX @@ tci_read_r(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
      return value;
  }
--/* Read indexed register (8 bit) from bytecode. */
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
--static uint8_t tci_read_r8(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
--{
+     ctx->z_mask = arg_info(op->args[3])->z_mask
--    uint8_t value = tci_read_reg8(regs, **tb_ptr);
+                 | arg_info(op->args[4])->z_mask;
--    *tb_ptr += 1;
++    ctx->s_mask = arg_info(op->args[3])->s_mask
--    return value;
++                & arg_info(op->args[4])->s_mask;
--}
--
+     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
- #if TCG_TARGET_HAS_ext8s_i32 || TCG_TARGET_HAS_ext8s_i64
+         uint64_t tv = arg_info(op->args[3])->val;
- /* Read indexed register (8 bit signed) from bytecode. */
+@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
- static int8_t tci_read_r8s(const tcg_target_ulong *regs, const uint8_t **tb_ptr)
+         fold_xi_to_not(ctx, op, -1)) {
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+         return true;
-             tci_write_reg(regs, t0, *(uint32_t *)(t1 + t2));
+     }
-             break;
++
-         CASE_32_64(st8)
++    ctx->s_mask = arg_info(op->args[1])->s_mask
--            t0 = tci_read_r8(regs, &tb_ptr);
++                & arg_info(op->args[2])->s_mask;
-+            t0 = tci_read_r(regs, &tb_ptr);
+     return false;
-             t1 = tci_read_r(regs, &tb_ptr);
+ }
-             t2 = tci_read_s32(&tb_ptr);
-             *(uint8_t *)(t1 + t2) = t0;
+@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+         fold_xi_to_not(ctx, op, 0)) {
- #if TCG_TARGET_HAS_ext8u_i32
+         return true;
-         case INDEX_op_ext8u_i32:
+     }
-             t0 = *tb_ptr++;
++
--            t1 = tci_read_r8(regs, &tb_ptr);
++    ctx->s_mask = arg_info(op->args[1])->s_mask
--            tci_write_reg(regs, t0, t1);
++                & arg_info(op->args[2])->s_mask;
-+            t1 = tci_read_r(regs, &tb_ptr);
+     return false;
-+            tci_write_reg(regs, t0, (uint8_t)t1);
+ }
-             break;
- #endif
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
- #if TCG_TARGET_HAS_ext16u_i32
+         return true;
-@@ -XXX,XX +XXX,XX @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
+     }
- #if TCG_TARGET_HAS_ext8u_i64
-         case INDEX_op_ext8u_i64:
++    ctx->s_mask = arg_info(op->args[1])->s_mask;
-             t0 = *tb_ptr++;
++
--            t1 = tci_read_r8(regs, &tb_ptr);
+     /* Because of fold_to_not, we want to always return true, via finish. */
--            tci_write_reg(regs, t0, t1);
+     finish_folding(ctx, op);
-+            t1 = tci_read_r(regs, &tb_ptr);
+     return true;
-+            tci_write_reg(regs, t0, (uint8_t)t1);
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
-             break;
- #endif
+     ctx->z_mask = arg_info(op->args[1])->z_mask
- #if TCG_TARGET_HAS_ext8s_i64
+                 | arg_info(op->args[2])->z_mask;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
          fold_ix_to_not(ctx, op, 0)) {
          return true;
      }
 +
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
      ctx->z_mask = arg_info(op->args[1])->z_mask
                  | arg_info(op->args[2])->z_mask;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
 --
 .25.1

-[PATCH 01/27] tcg/aarch64: Fix constant subtraction in tcg_out_addsub2
+[PULL 54/56] tcg/optimize: Propagate sign info for setcond
-An hppa guest executing
+The result is either 0 or 1, which means that we have
 a 2 bit signed result, and thus 62 bits of sign.
 For clarity, use the smask_from_zmask function.
-x000000000000e05c:  ldil L%10000,r4
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-x000000000000e060:  ldo 0(r4),r4
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 x000000000000e064:  sub r3,r4,sp
 produces
  ---- 000000000000e064 000000000000e068
  sub2_i32 tmp0,tmp4,r3,$0x1,$0x10000,$0x0
 after folding and constant propagation.  Then we hit
 tcg-target.c.inc:640: tcg_out_insn_3401: Assertion `aimm <= 0xfff' failed.
 because aimm is in fact -16, but unsigned.
 The ((bl < 0) ^ sub) condition which negates bl is incorrect and will
 always lead to this abort.  If the constant is positive, sub will make
 it negative; if the constant is negative, sub will keep it negative.
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/aarch64/tcg-target.c.inc | 16 +++++++++-------
+ tcg/optimize.c | 2 ++
-file changed, 9 insertions(+), 7 deletions(-)
+file changed, 2 insertions(+)
-diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/aarch64/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/aarch64/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void tcg_out_addsubi(TCGContext *s, int ext, TCGReg rd,
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
      }
+     ctx->z_mask = 1;
++    ctx->s_mask = smask_from_zmask(1);
+     return false;
  }
--static inline void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 -                                   TCGReg rh, TCGReg al, TCGReg ah,
 -                                   tcg_target_long bl, tcg_target_long bh,
 -                                   bool const_bl, bool const_bh, bool sub)
 +static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
 +                            TCGReg rh, TCGReg al, TCGReg ah,
 +                            tcg_target_long bl, tcg_target_long bh,
 +                            bool const_bl, bool const_bh, bool sub)
  {
      TCGReg orig_rl = rl;
      AArch64Insn insn;
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
      }
-     if (const_bl) {
+     ctx->z_mask = 1;
--        insn = I3401_ADDSI;
++    ctx->s_mask = smask_from_zmask(1);
--        if ((bl < 0) ^ sub) {
+     return false;
--            insn = I3401_SUBSI;
-+        if (bl < 0) {
+  do_setcond_const:
              bl = -bl;
 +            insn = sub ? I3401_ADDSI : I3401_SUBSI;
 +        } else {
 +            insn = sub ? I3401_SUBSI : I3401_ADDSI;
          }
 +
          if (unlikely(al == TCG_REG_XZR)) {
              /* ??? We want to allow al to be zero for the benefit of
                 negation via subtraction.  However, that leaves open the
 --
 .25.1

-[PATCH 07/27] tcg/tci: Merge identical cases in generation (arithmetic opcodes)
+[PULL 55/56] tcg/optimize: Propagate sign info for bit counting
-Use CASE_32_64 and CASE_64 to reduce ifdefs and merge
+The results are generally 6 bit unsigned values, though
-cases that are identical between 32-bit and 64-bit hosts.
+the count leading and trailing bits may produce any value
 for a zero input.
-Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Message-Id: <20210217202036.1724901-5-richard.henderson@linaro.org>
 [PMD: Split patch as 1/5]
 Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Message-Id: <20210218232840.1760806-2-f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- tcg/tci/tcg-target.c.inc | 85 +++++++++++++++++-----------------------
+ tcg/optimize.c | 3 ++-
-file changed, 37 insertions(+), 48 deletions(-)
+file changed, 2 insertions(+), 1 deletion(-)
-diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/tcg/tci/tcg-target.c.inc
+--- a/tcg/optimize.c
-+++ b/tcg/tci/tcg-target.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
-     old_code_ptr[1] = s->code_ptr - old_code_ptr;
+         g_assert_not_reached();
      }
      ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
 -
 +    ctx->s_mask = smask_from_zmask(ctx->z_mask);
      return false;
  }
-+#if TCG_TARGET_REG_BITS == 64
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-+# define CASE_32_64(x) \
+     default:
-+        case glue(glue(INDEX_op_, x), _i64): \
+         g_assert_not_reached();
-+        case glue(glue(INDEX_op_, x), _i32):
+     }
-+# define CASE_64(x) \
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
-+        case glue(glue(INDEX_op_, x), _i64):
+     return false;
-+#else
+ }
-+# define CASE_32_64(x) \
 +        case glue(glue(INDEX_op_, x), _i32):
 +# define CASE_64(x)
 +#endif
 +
  static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
                         const int *const_args)
  {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
      case INDEX_op_exit_tb:
          tcg_out64(s, args[0]);
          break;
 +
      case INDEX_op_goto_tb:
          if (s->tb_jmp_insn_offset) {
              /* Direct jump method. */
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          tcg_debug_assert(args[2] == (int32_t)args[2]);
          tcg_out32(s, args[2]);
          break;
 -    case INDEX_op_add_i32:
 -    case INDEX_op_sub_i32:
 -    case INDEX_op_mul_i32:
 -    case INDEX_op_and_i32:
 -    case INDEX_op_andc_i32:     /* Optional (TCG_TARGET_HAS_andc_i32). */
 -    case INDEX_op_eqv_i32:      /* Optional (TCG_TARGET_HAS_eqv_i32). */
 -    case INDEX_op_nand_i32:     /* Optional (TCG_TARGET_HAS_nand_i32). */
 -    case INDEX_op_nor_i32:      /* Optional (TCG_TARGET_HAS_nor_i32). */
 -    case INDEX_op_or_i32:
 -    case INDEX_op_orc_i32:      /* Optional (TCG_TARGET_HAS_orc_i32). */
 -    case INDEX_op_xor_i32:
 -    case INDEX_op_shl_i32:
 -    case INDEX_op_shr_i32:
 -    case INDEX_op_sar_i32:
 -    case INDEX_op_rotl_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
 -    case INDEX_op_rotr_i32:     /* Optional (TCG_TARGET_HAS_rot_i32). */
 +
 +    CASE_32_64(add)
 +    CASE_32_64(sub)
 +    CASE_32_64(mul)
 +    CASE_32_64(and)
 +    CASE_32_64(or)
 +    CASE_32_64(xor)
 +    CASE_32_64(andc)     /* Optional (TCG_TARGET_HAS_andc_*). */
 +    CASE_32_64(orc)      /* Optional (TCG_TARGET_HAS_orc_*). */
 +    CASE_32_64(eqv)      /* Optional (TCG_TARGET_HAS_eqv_*). */
 +    CASE_32_64(nand)     /* Optional (TCG_TARGET_HAS_nand_*). */
 +    CASE_32_64(nor)      /* Optional (TCG_TARGET_HAS_nor_*). */
 +    CASE_32_64(shl)
 +    CASE_32_64(shr)
 +    CASE_32_64(sar)
 +    CASE_32_64(rotl)     /* Optional (TCG_TARGET_HAS_rot_*). */
 +    CASE_32_64(rotr)     /* Optional (TCG_TARGET_HAS_rot_*). */
 +    CASE_32_64(div)      /* Optional (TCG_TARGET_HAS_div_*). */
 +    CASE_32_64(divu)     /* Optional (TCG_TARGET_HAS_div_*). */
 +    CASE_32_64(rem)      /* Optional (TCG_TARGET_HAS_div_*). */
 +    CASE_32_64(remu)     /* Optional (TCG_TARGET_HAS_div_*). */
          tcg_out_r(s, args[0]);
          tcg_out_r(s, args[1]);
          tcg_out_r(s, args[2]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          break;
  #if TCG_TARGET_REG_BITS == 64
 -    case INDEX_op_add_i64:
 -    case INDEX_op_sub_i64:
 -    case INDEX_op_mul_i64:
 -    case INDEX_op_and_i64:
 -    case INDEX_op_andc_i64:     /* Optional (TCG_TARGET_HAS_andc_i64). */
 -    case INDEX_op_eqv_i64:      /* Optional (TCG_TARGET_HAS_eqv_i64). */
 -    case INDEX_op_nand_i64:     /* Optional (TCG_TARGET_HAS_nand_i64). */
 -    case INDEX_op_nor_i64:      /* Optional (TCG_TARGET_HAS_nor_i64). */
 -    case INDEX_op_or_i64:
 -    case INDEX_op_orc_i64:      /* Optional (TCG_TARGET_HAS_orc_i64). */
 -    case INDEX_op_xor_i64:
 -    case INDEX_op_shl_i64:
 -    case INDEX_op_shr_i64:
 -    case INDEX_op_sar_i64:
 -    case INDEX_op_rotl_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
 -    case INDEX_op_rotr_i64:     /* Optional (TCG_TARGET_HAS_rot_i64). */
 -    case INDEX_op_div_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
 -    case INDEX_op_divu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
 -    case INDEX_op_rem_i64:      /* Optional (TCG_TARGET_HAS_div_i64). */
 -    case INDEX_op_remu_i64:     /* Optional (TCG_TARGET_HAS_div_i64). */
 -        tcg_out_r(s, args[0]);
 -        tcg_out_r(s, args[1]);
 -        tcg_out_r(s, args[2]);
 -        break;
      case INDEX_op_deposit_i64:  /* Optional (TCG_TARGET_HAS_deposit_i64). */
          tcg_out_r(s, args[0]);
          tcg_out_r(s, args[1]);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          tcg_out_r(s, args[0]);
          tcg_out_r(s, args[1]);
          break;
 -    case INDEX_op_div_i32:      /* Optional (TCG_TARGET_HAS_div_i32). */
 -    case INDEX_op_divu_i32:     /* Optional (TCG_TARGET_HAS_div_i32). */
 -    case INDEX_op_rem_i32:      /* Optional (TCG_TARGET_HAS_div_i32). */
 -    case INDEX_op_remu_i32:     /* Optional (TCG_TARGET_HAS_div_i32). */
 -        tcg_out_r(s, args[0]);
 -        tcg_out_r(s, args[1]);
 -        tcg_out_r(s, args[2]);
 -        break;
 +
  #if TCG_TARGET_REG_BITS == 32
      case INDEX_op_add2_i32:
      case INDEX_op_sub2_i32:
@@ -XXX,XX +XXX,XX @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
          }
          tcg_out_i(s, *args++);
          break;
 +
      case INDEX_op_mb:
          break;
 +
      case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
      case INDEX_op_mov_i64:
      case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
 --
 .25.1

-[PATCH 27/27] accel/tcg: Precompute curr_cflags into cpu->tcg_cflags
+[PULL 56/56] tcg/optimize: Propagate sign info for shifting
-The primary motivation is to remove a dozen insns along
+For constant shifts, we can simply shift the s_mask.
 the fast-path in tb_lookup.  As a byproduct, this allows
 us to completely remove parallel_cpus.
+For variable shifts, we know that sar does not reduce
+the s_mask, which helps for sequences like
+    ext32s_i64  t, in
+    sar_i64     t, t, v
+    ext32s_i64  out, t
+allowing the final extend to be eliminated.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/tcg-accel-ops.h       |  1 +
+ tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
- include/exec/exec-all.h         |  7 +------
+file changed, 47 insertions(+), 3 deletions(-)
  include/hw/core/cpu.h           |  2 ++
  accel/tcg/cpu-exec.c            |  3 ---
  accel/tcg/tcg-accel-ops-mttcg.c |  3 +--
  accel/tcg/tcg-accel-ops-rr.c    |  2 +-
  accel/tcg/tcg-accel-ops.c       |  8 ++++++++
  accel/tcg/translate-all.c       |  4 ----
  linux-user/main.c               |  1 +
  linux-user/sh4/signal.c         |  8 +++++---
  linux-user/syscall.c            | 18 ++++++++++--------
 files changed, 30 insertions(+), 27 deletions(-)
-diff --git a/accel/tcg/tcg-accel-ops.h b/accel/tcg/tcg-accel-ops.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/tcg-accel-ops.h
+--- a/tcg/optimize.c
-+++ b/accel/tcg/tcg-accel-ops.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
- void tcg_cpus_destroy(CPUState *cpu);
+     return ~(~0ull >> rep);
  int tcg_cpus_exec(CPUState *cpu);
  void tcg_handle_interrupt(CPUState *cpu, int mask);
 +void tcg_cpu_init_cflags(CPUState *cpu, bool parallel);
  #endif /* TCG_CPUS_H */
 diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/exec-all.h
 +++ b/include/exec/exec-all.h
@@ -XXX,XX +XXX,XX @@ struct TranslationBlock {
      uintptr_t jmp_dest[2];
  };
 -extern bool parallel_cpus;
 -
  /* Hide the qatomic_read to make code a little easier on the eyes */
  static inline uint32_t tb_cflags(const TranslationBlock *tb)
  {
@@ -XXX,XX +XXX,XX @@ static inline uint32_t tb_cflags(const TranslationBlock *tb)
  /* current cflags for hashing/comparison */
  static inline uint32_t curr_cflags(CPUState *cpu)
  {
 -    uint32_t cflags = deposit32(0, CF_CLUSTER_SHIFT, 8, cpu->cluster_index);
 -    cflags |= parallel_cpus ? CF_PARALLEL : 0;
 -    cflags |= icount_enabled() ? CF_USE_ICOUNT : 0;
 -    return cflags;
 +    return cpu->tcg_cflags;
  }
- /* TranslationBlock invalidate API */
++/*
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
++ * Recreate a properly left-aligned smask after manipulation.
-index XXXXXXX..XXXXXXX 100644
++ * Some bit-shuffling, particularly shifts and rotates, may
---- a/include/hw/core/cpu.h
++ * retain sign bits on the left, but may scatter disconnected
-+++ b/include/hw/core/cpu.h
++ * sign bits on the right.  Retain only what remains to the left.
-@@ -XXX,XX +XXX,XX @@ struct qemu_work_item;
++ */
-  *   to a cluster this will be UNASSIGNED_CLUSTER_INDEX; otherwise it will
++static uint64_t smask_from_smask(int64_t smask)
   *   be the same as the cluster-id property of the CPU object's TYPE_CPU_CLUSTER
   *   QOM parent.
 + * @tcg_cflags: Pre-computed cflags for this cpu.
   * @nr_cores: Number of cores within this CPU package.
   * @nr_threads: Number of threads within this CPU.
   * @running: #true if CPU is currently running (lockless).
@@ -XXX,XX +XXX,XX @@ struct CPUState {
      /* TODO Move common fields from CPUArchState here. */
      int cpu_index;
      int cluster_index;
 +    uint32_t tcg_cflags;
      uint32_t halted;
      uint32_t can_do_io;
      int32_t exception_index;
 diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cpu-exec.c
 +++ b/accel/tcg/cpu-exec.c
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
              mmap_unlock();
          }
 -        /* Since we got here, we know that parallel_cpus must be true.  */
 -        parallel_cpus = false;
          cpu_exec_enter(cpu);
          /* execute the generated code */
          trace_exec_tb(tb, pc);
@@ -XXX,XX +XXX,XX @@ void cpu_exec_step_atomic(CPUState *cpu)
       * the execution.
       */
      g_assert(cpu_in_exclusive_context(cpu));
 -    parallel_cpus = true;
      cpu->running = false;
      end_exclusive();
  }
 diff --git a/accel/tcg/tcg-accel-ops-mttcg.c b/accel/tcg/tcg-accel-ops-mttcg.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-accel-ops-mttcg.c
 +++ b/accel/tcg/tcg-accel-ops-mttcg.c
@@ -XXX,XX +XXX,XX @@ void mttcg_start_vcpu_thread(CPUState *cpu)
      char thread_name[VCPU_THREAD_NAME_SIZE];
      g_assert(tcg_enabled());
 -
 -    parallel_cpus = (current_machine->smp.max_cpus > 1);
 +    tcg_cpu_init_cflags(cpu, current_machine->smp.max_cpus > 1);
      cpu->thread = g_malloc0(sizeof(QemuThread));
      cpu->halt_cond = g_malloc0(sizeof(QemuCond));
 diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-accel-ops-rr.c
 +++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -XXX,XX +XXX,XX @@ void rr_start_vcpu_thread(CPUState *cpu)
      static QemuThread *single_tcg_cpu_thread;
      g_assert(tcg_enabled());
 -    parallel_cpus = false;
 +    tcg_cpu_init_cflags(cpu, false);
      if (!single_tcg_cpu_thread) {
          cpu->thread = g_malloc0(sizeof(QemuThread));
 diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/tcg-accel-ops.c
 +++ b/accel/tcg/tcg-accel-ops.c
@@ -XXX,XX +XXX,XX @@
  /* common functionality among all TCG variants */
 +void tcg_cpu_init_cflags(CPUState *cpu, bool parallel)
 +{
-+    uint32_t cflags = cpu->cluster_index << CF_CLUSTER_SHIFT;
++    /* Only the 1 bits are significant for smask */
-+    cflags |= parallel ? CF_PARALLEL : 0;
++    return smask_from_zmask(~smask);
 +    cflags |= icount_enabled() ? CF_USE_ICOUNT : 0;
 +    cpu->tcg_cflags = cflags;
 +}
 +
- void tcg_cpus_destroy(CPUState *cpu)
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
-     cpu_thread_signal_destroyed(cpu);
+     return ts->state_ptr;
-diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/translate-all.c
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static void *l1_map[V_L1_MAX_SIZE];
  TCGContext tcg_init_ctx;
  __thread TCGContext *tcg_ctx;
  TBContext tb_ctx;
 -bool parallel_cpus;
  static void page_table_config_init(void)
  {
-@@ -XXX,XX +XXX,XX @@ TranslationBlock *tb_gen_code(CPUState *cpu,
++    uint64_t s_mask, z_mask, sign;
-         cflags = (cflags & ~CF_COUNT_MASK) | 1;
++
      if (fold_const2(ctx, op) ||
          fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
--    cflags &= ~CF_CLUSTER_MASK;
++    s_mask = arg_info(op->args[1])->s_mask;
--    cflags |= cpu->cluster_index << CF_CLUSTER_SHIFT;
++    z_mask = arg_info(op->args[1])->z_mask;
--
++
-     max_insns = cflags & CF_COUNT_MASK;
+     if (arg_is_const(op->args[2])) {
-     if (max_insns == 0) {
+-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-         max_insns = CF_COUNT_MASK;
+-                                          arg_info(op->args[1])->z_mask,
-diff --git a/linux-user/main.c b/linux-user/main.c
+-                                          arg_info(op->args[2])->val);
-index XXXXXXX..XXXXXXX 100644
++        int sh = arg_info(op->args[2])->val;
---- a/linux-user/main.c
++
-+++ b/linux-user/main.c
++        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
-@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env)
++
-     /* Reset non arch specific state */
++        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
-     cpu_reset(new_cpu);
++        ctx->s_mask = smask_from_smask(s_mask);
++
-+    new_cpu->tcg_cflags = cpu->tcg_cflags;
+         return fold_masks(ctx, op);
-     memcpy(new_env, env, sizeof(CPUArchState));
+     }
++
-     /* Clone all break/watchpoints.
++    switch (op->opc) {
-diff --git a/linux-user/sh4/signal.c b/linux-user/sh4/signal.c
++    CASE_OP_32_64(sar):
-index XXXXXXX..XXXXXXX 100644
++        /*
---- a/linux-user/sh4/signal.c
++         * Arithmetic right shift will not reduce the number of
-+++ b/linux-user/sh4/signal.c
++         * input sign repetitions.
-@@ -XXX,XX +XXX,XX @@ static abi_ulong get_sigframe(struct target_sigaction *ka,
++         */
-     return (sp - frame_size) & -8ul;
++        ctx->s_mask = s_mask;
 +        break;
 +    CASE_OP_32_64(shr):
 +        /*
 +         * If the sign bit is known zero, then logical right shift
 +         * will not reduce the number of input sign repetitions.
 +         */
 +        sign = (s_mask & -s_mask) >> 1;
 +        if (!(z_mask & sign)) {
 +            ctx->s_mask = s_mask;
 +        }
 +        break;
 +    default:
 +        break;
 +    }
 +
      return false;
  }
--/* Notice when we're in the middle of a gUSA region and reset.
--   Note that this will only occur for !parallel_cpus, as we will
--   translate such sequences differently in a parallel context.  */
-+/*
-+ * Notice when we're in the middle of a gUSA region and reset.
-+ * Note that this will only occur when #CF_PARALLEL is unset, as we
-+ * will translate such sequences differently in a parallel context.
-+ */
- static void unwind_gusa(CPUSH4State *regs)
- {
-     /* If the stack pointer is sufficiently negative, and we haven't
-diff --git a/linux-user/syscall.c b/linux-user/syscall.c
-index XXXXXXX..XXXXXXX 100644
---- a/linux-user/syscall.c
-+++ b/linux-user/syscall.c
-@@ -XXX,XX +XXX,XX @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp,
-         /* Grab a mutex so that thread setup appears atomic.  */
-         pthread_mutex_lock(&clone_lock);
-+        /*
-+         * If this is our first additional thread, we need to ensure we
-+         * generate code for parallel execution and flush old translations.
-+         * Do this now so that the copy gets CF_PARALLEL too.
-+         */
-+        if (!(cpu->tcg_cflags & CF_PARALLEL)) {
-+            cpu->tcg_cflags |= CF_PARALLEL;
-+            tb_flush(cpu);
-+        }
-+
-         /* we create a new CPU instance. */
-         new_env = cpu_copy(env);
-         /* Init regs that differ from the parent.  */
-@@ -XXX,XX +XXX,XX @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp,
-         sigprocmask(SIG_BLOCK, &sigmask, &info.sigmask);
-         cpu->random_seed = qemu_guest_random_seed_thread_part1();
--        /* If this is our first additional thread, we need to ensure we
--         * generate code for parallel execution and flush old translations.
--         */
--        if (!parallel_cpus) {
--            parallel_cpus = true;
--            tb_flush(cpu);
--        }
--
-         ret = pthread_create(&info.thread, &attr, clone_func, &info);
-         /* TODO: Free new CPU state if thread creation failed.  */
 --
 .25.1

Pulling together some cleanups, fixes, and preparatory tci stuff.
Most of this has been reviewed, but not all.

Those lacking review:

01-tcg-aarch64-Fix-constant-subtraction-in-tcg_out_adds.patch
02-tcg-aarch64-Fix-I3617_CMLE0.patch
03-tcg-aarch64-Fix-generation-of-scalar-vector-operatio.patch
04-tcg-tci-Use-exec-cpu_ldst.h-interfaces.patch
06-tcg-Manage-splitwx-in-tc_ptr_to_region_tree-by-hand.patch
23-accel-tcg-rename-tb_lookup__cpu_state-and-hoist-stat.patch
24-accel-tcg-move-CF_CLUSTER-calculation-to-curr_cflags.patch
25-accel-tcg-drop-the-use-of-CF_HASH_MASK-and-rename-pa.patch
26-include-exec-lightly-re-arrange-TranslationBlock.patch
27-accel-tcg-Precompute-curr_cflags-into-cpu-tcg_cflags.patch

Alex, the last patch is a re-write and extension of one that
you did review.

Alex Bennée (4):
  accel/tcg: rename tb_lookup__cpu_state and hoist state extraction
  accel/tcg: move CF_CLUSTER calculation to curr_cflags
  accel/tcg: drop the use of CF_HASH_MASK and rename params
  include/exec: lightly re-arrange TranslationBlock

Richard Henderson (23):
  tcg/aarch64: Fix constant subtraction in tcg_out_addsub2
  tcg/aarch64: Fix I3617_CMLE0
  tcg/aarch64: Fix generation of "scalar" vector operations
  tcg/tci: Use exec/cpu_ldst.h interfaces
  tcg: Split out tcg_raise_tb_overflow
  tcg: Manage splitwx in tc_ptr_to_region_tree by hand
  tcg/tci: Merge identical cases in generation (arithmetic opcodes)
  tcg/tci: Merge identical cases in generation (exchange opcodes)
  tcg/tci: Merge identical cases in generation (deposit opcode)
  tcg/tci: Merge identical cases in generation (conditional opcodes)
  tcg/tci: Merge identical cases in generation (load/store opcodes)
  tcg/tci: Remove tci_read_r8
  tcg/tci: Remove tci_read_r8s
  tcg/tci: Remove tci_read_r16
  tcg/tci: Remove tci_read_r16s
  tcg/tci: Remove tci_read_r32
  tcg/tci: Remove tci_read_r32s
  tcg/tci: Reduce use of tci_read_r64
  tcg/tci: Merge basic arithmetic operations
  tcg/tci: Merge extension operations
  tcg/tci: Merge bswap operations
  tcg/tci: Merge mov, not and neg operations
  accel/tcg: Precompute curr_cflags into cpu->tcg_cflags

-- 
2.25.1

An hppa guest executing

0x000000000000e05c:  ldil L%10000,r4
0x000000000000e060:  ldo 0(r4),r4
0x000000000000e064:  sub r3,r4,sp

produces

---- 000000000000e064 000000000000e068
 sub2_i32 tmp0,tmp4,r3,$0x1,$0x10000,$0x0

after folding and constant propagation.  Then we hit

tcg-target.c.inc:640: tcg_out_insn_3401: Assertion `aimm <= 0xfff' failed.

because aimm is in fact -16, but unsigned.

The ((bl < 0) ^ sub) condition which negates bl is incorrect and will
always lead to this abort.  If the constant is positive, sub will make
it negative; if the constant is negative, sub will keep it negative.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.c.inc | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ static void tcg_out_addsubi(TCGContext *s, int ext, TCGReg rd,
     }
 }
 
-static inline void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
-                                   TCGReg rh, TCGReg al, TCGReg ah,
-                                   tcg_target_long bl, tcg_target_long bh,
-                                   bool const_bl, bool const_bh, bool sub)
+static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
+                            TCGReg rh, TCGReg al, TCGReg ah,
+                            tcg_target_long bl, tcg_target_long bh,
+                            bool const_bl, bool const_bh, bool sub)
 {
     TCGReg orig_rl = rl;
     AArch64Insn insn;
@@ -XXX,XX +XXX,XX @@ static inline void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
     }
 
     if (const_bl) {
-        insn = I3401_ADDSI;
-        if ((bl < 0) ^ sub) {
-            insn = I3401_SUBSI;
+        if (bl < 0) {
             bl = -bl;
+            insn = sub ? I3401_ADDSI : I3401_SUBSI;
+        } else {
+            insn = sub ? I3401_SUBSI : I3401_ADDSI;
         }
+
         if (unlikely(al == TCG_REG_XZR)) {
             /* ??? We want to allow al to be zero for the benefit of
                negation via subtraction.  However, that leaves open the
-- 
2.25.1

For some vector operations, "1D" is not a valid type, and there
are separate instructions for the 64-bit scalar operation.

Tested-by: Stefan Weil <sw@weilnetz.de>
Buglink: https://bugs.launchpad.net/qemu/+bug/1916112
Fixes: 14e4c1e2355 ("tcg/aarch64: Add vector operations")
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.c.inc | 211 ++++++++++++++++++++++++++++++-----
 1 file changed, 181 insertions(+), 30 deletions(-)

diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/tcg/aarch64/tcg-target.c.inc
+++ b/tcg/aarch64/tcg-target.c.inc
@@ -XXX,XX +XXX,XX @@ typedef enum {
     I3606_BIC       = 0x2f001400,
     I3606_ORR       = 0x0f001400,
 
+    /* AdvSIMD scalar shift by immediate */
+    I3609_SSHR      = 0x5f000400,
+    I3609_SSRA      = 0x5f001400,
+    I3609_SHL       = 0x5f005400,
+    I3609_USHR      = 0x7f000400,
+    I3609_USRA      = 0x7f001400,
+    I3609_SLI       = 0x7f005400,
+
+    /* AdvSIMD scalar three same */
+    I3611_SQADD     = 0x5e200c00,
+    I3611_SQSUB     = 0x5e202c00,
+    I3611_CMGT      = 0x5e203400,
+    I3611_CMGE      = 0x5e203c00,
+    I3611_SSHL      = 0x5e204400,
+    I3611_ADD       = 0x5e208400,
+    I3611_CMTST     = 0x5e208c00,
+    I3611_UQADD     = 0x7e200c00,
+    I3611_UQSUB     = 0x7e202c00,
+    I3611_CMHI      = 0x7e203400,
+    I3611_CMHS      = 0x7e203c00,
+    I3611_USHL      = 0x7e204400,
+    I3611_SUB       = 0x7e208400,
+    I3611_CMEQ      = 0x7e208c00,
+
+    /* AdvSIMD scalar two-reg misc */
+    I3612_CMGT0     = 0x5e208800,
+    I3612_CMEQ0     = 0x5e209800,
+    I3612_CMLT0     = 0x5e20a800,
+    I3612_ABS       = 0x5e20b800,
+    I3612_CMGE0     = 0x7e208800,
+    I3612_CMLE0     = 0x7e209800,
+    I3612_NEG       = 0x7e20b800,
+
     /* AdvSIMD shift by immediate */
     I3614_SSHR      = 0x0f000400,
     I3614_SSRA      = 0x0f001400,
@@ -XXX,XX +XXX,XX @@ static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
               | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5);
 }
 
+static void tcg_out_insn_3609(TCGContext *s, AArch64Insn insn,
+                              TCGReg rd, TCGReg rn, unsigned immhb)
+{
+    tcg_out32(s, insn | immhb << 16 | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
+static void tcg_out_insn_3611(TCGContext *s, AArch64Insn insn,
+                              unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
+{
+    tcg_out32(s, insn | (size << 22) | (rm & 0x1f) << 16
+              | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
+static void tcg_out_insn_3612(TCGContext *s, AArch64Insn insn,
+                              unsigned size, TCGReg rd, TCGReg rn)
+{
+    tcg_out32(s, insn | (size << 22) | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
 static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q,
                               TCGReg rd, TCGReg rn, unsigned immhb)
 {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                            unsigned vecl, unsigned vece,
                            const TCGArg *args, const int *const_args)
 {
-    static const AArch64Insn cmp_insn[16] = {
+    static const AArch64Insn cmp_vec_insn[16] = {
         [TCG_COND_EQ] = I3616_CMEQ,
         [TCG_COND_GT] = I3616_CMGT,
         [TCG_COND_GE] = I3616_CMGE,
         [TCG_COND_GTU] = I3616_CMHI,
         [TCG_COND_GEU] = I3616_CMHS,
     };
-    static const AArch64Insn cmp0_insn[16] = {
+    static const AArch64Insn cmp_scalar_insn[16] = {
+        [TCG_COND_EQ] = I3611_CMEQ,
+        [TCG_COND_GT] = I3611_CMGT,
+        [TCG_COND_GE] = I3611_CMGE,
+        [TCG_COND_GTU] = I3611_CMHI,
+        [TCG_COND_GEU] = I3611_CMHS,
+    };
+    static const AArch64Insn cmp0_vec_insn[16] = {
         [TCG_COND_EQ] = I3617_CMEQ0,
         [TCG_COND_GT] = I3617_CMGT0,
         [TCG_COND_GE] = I3617_CMGE0,
         [TCG_COND_LT] = I3617_CMLT0,
         [TCG_COND_LE] = I3617_CMLE0,
     };
+    static const AArch64Insn cmp0_scalar_insn[16] = {
+        [TCG_COND_EQ] = I3612_CMEQ0,
+        [TCG_COND_GT] = I3612_CMGT0,
+        [TCG_COND_GE] = I3612_CMGE0,
+        [TCG_COND_LT] = I3612_CMLT0,
+        [TCG_COND_LE] = I3612_CMLE0,
+    };
 
     TCGType type = vecl + TCG_TYPE_V64;
     unsigned is_q = vecl;
+    bool is_scalar = !is_q && vece == MO_64;
     TCGArg a0, a1, a2, a3;
     int cmode, imm8;
 
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
         break;
     case INDEX_op_add_vec:
-        tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3611, ADD, vece, a0, a1, a2);
+        } else {
+            tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
+        }
         break;
     case INDEX_op_sub_vec:
-        tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3611, SUB, vece, a0, a1, a2);
+        } else {
+            tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
+        }
         break;
     case INDEX_op_mul_vec:
         tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2);
         break;
     case INDEX_op_neg_vec:
-        tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
+        if (is_scalar) {
+            tcg_out_insn(s, 3612, NEG, vece, a0, a1);
+        } else {
+            tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
+        }
         break;
     case INDEX_op_abs_vec:
-        tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1);
+        if (is_scalar) {
+            tcg_out_insn(s, 3612, ABS, vece, a0, a1);
+        } else {
+            tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1);
+        }
         break;
     case INDEX_op_and_vec:
         if (const_args[2]) {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2);
         break;
     case INDEX_op_ssadd_vec:
-        tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3611, SQADD, vece, a0, a1, a2);
+        } else {
+            tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
+        }
         break;
     case INDEX_op_sssub_vec:
-        tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3611, SQSUB, vece, a0, a1, a2);
+        } else {
+            tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
+        }
         break;
     case INDEX_op_usadd_vec:
-        tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3611, UQADD, vece, a0, a1, a2);
+        } else {
+            tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
+        }
         break;
     case INDEX_op_ussub_vec:
-        tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3611, UQSUB, vece, a0, a1, a2);
+        } else {
+            tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
+        }
         break;
     case INDEX_op_smax_vec:
         tcg_out_insn(s, 3616, SMAX, is_q, vece, a0, a1, a2);
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
         break;
     case INDEX_op_shli_vec:
-        tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
+        if (is_scalar) {
+            tcg_out_insn(s, 3609, SHL, a0, a1, a2 + (8 << vece));
+        } else {
+            tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
+        }
         break;
     case INDEX_op_shri_vec:
-        tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3609, USHR, a0, a1, (16 << vece) - a2);
+        } else {
+            tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2);
+        }
         break;
     case INDEX_op_sari_vec:
-        tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3609, SSHR, a0, a1, (16 << vece) - a2);
+        } else {
+            tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
+        }
         break;
     case INDEX_op_aa64_sli_vec:
-        tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece));
+        if (is_scalar) {
+            tcg_out_insn(s, 3609, SLI, a0, a2, args[3] + (8 << vece));
+        } else {
+            tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece));
+        }
         break;
     case INDEX_op_shlv_vec:
-        tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3611, USHL, vece, a0, a1, a2);
+        } else {
+            tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2);
+        }
         break;
     case INDEX_op_aa64_sshl_vec:
-        tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2);
+        if (is_scalar) {
+            tcg_out_insn(s, 3611, SSHL, vece, a0, a1, a2);
+        } else {
+            tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2);
+        }
         break;
     case INDEX_op_cmp_vec:
         {
@@ -XXX,XX +XXX,XX @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
 
             if (cond == TCG_COND_NE) {
                 if (const_args[2]) {
-                    tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
+                    if (is_scalar) {
+                        tcg_out_insn(s, 3611, CMTST, vece, a0, a1, a1);
+                    } else {
+                        tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
+                    }
                 } else {
-                    tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
+                    if (is_scalar) {
+                        tcg_out_insn(s, 3611, CMEQ, vece, a0, a1, a2);
+                    } else {
+                        tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
+                    }
                     tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0);
                 }
             } else {
                 if (const_args[2]) {
-                    insn = cmp0_insn[cond];
-                    if (insn) {
-                        tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
-                        break;
+                    if (is_scalar) {
+                        insn = cmp0_scalar_insn[cond];
+                        if (insn) {
+                            tcg_out_insn_3612(s, insn, vece, a0, a1);
+                            break;
+                        }
+                    } else {
+                        insn = cmp0_vec_insn[cond];
+                        if (insn) {
+                            tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
+                            break;
+                        }
                     }
                     tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP, 0);
                     a2 = TCG_VEC_TMP;
                 }
-                insn = cmp_insn[cond];
-                if (insn == 0) {
-                    TCGArg t;
-                    t = a1, a1 = a2, a2 = t;
-                    cond = tcg_swap_cond(cond);
-                    insn = cmp_insn[cond];
-                    tcg_debug_assert(insn != 0);
+                if (is_scalar) {
+                    insn = cmp_scalar_insn[cond];
+                    if (insn == 0) {
+                        TCGArg t;
+                        t = a1, a1 = a2, a2 = t;
+                        cond = tcg_swap_cond(cond);
+                        insn = cmp_scalar_insn[cond];
+                        tcg_debug_assert(insn != 0);
+                    }
+                    tcg_out_insn_3611(s, insn, vece, a0, a1, a2);
+                } else {
+                    insn = cmp_vec_insn[cond];
+                    if (insn == 0) {
+                        TCGArg t;
+                        t = a1, a1 = a2, a2 = t;
+                        cond = tcg_swap_cond(cond);
+                        insn = cmp_vec_insn[cond];
+                        tcg_debug_assert(insn != 0);
+                    }
+                    tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
                 }
-                tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
             }
         }
         break;
-- 
2.25.1

Use the provided cpu_ldst.h interfaces.  This fixes the build with respect to
the unconverted uses of g2h(), adds missed memory trace events,
and correctly recognizes when a SIGSEGV belongs to the guest via
set_helper_retaddr().

Fixes: 3e8f1628e864
Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tci.c | 73 +++++++++++++++++++++----------------------------------
 1 file changed, 28 insertions(+), 45 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -XXX,XX +XXX,XX @@ static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
     return result;
 }
 
-#ifdef CONFIG_SOFTMMU
-# define qemu_ld_ub \
-    helper_ret_ldub_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
-# define qemu_ld_leuw \
-    helper_le_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
-# define qemu_ld_leul \
-    helper_le_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
-# define qemu_ld_leq \
-    helper_le_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
-# define qemu_ld_beuw \
-    helper_be_lduw_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
-# define qemu_ld_beul \
-    helper_be_ldul_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
-# define qemu_ld_beq \
-    helper_be_ldq_mmu(env, taddr, oi, (uintptr_t)tb_ptr)
-# define qemu_st_b(X) \
-    helper_ret_stb_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
-# define qemu_st_lew(X) \
-    helper_le_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
-# define qemu_st_lel(X) \
-    helper_le_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
-# define qemu_st_leq(X) \
-    helper_le_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
-# define qemu_st_bew(X) \
-    helper_be_stw_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
-# define qemu_st_bel(X) \
-    helper_be_stl_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
-# define qemu_st_beq(X) \
-    helper_be_stq_mmu(env, taddr, X, oi, (uintptr_t)tb_ptr)
-#else
-# define qemu_ld_ub      ldub_p(g2h(taddr))
-# define qemu_ld_leuw    lduw_le_p(g2h(taddr))
-# define qemu_ld_leul    (uint32_t)ldl_le_p(g2h(taddr))
-# define qemu_ld_leq     ldq_le_p(g2h(taddr))
-# define qemu_ld_beuw    lduw_be_p(g2h(taddr))
-# define qemu_ld_beul    (uint32_t)ldl_be_p(g2h(taddr))
-# define qemu_ld_beq     ldq_be_p(g2h(taddr))
-# define qemu_st_b(X)    stb_p(g2h(taddr), X)
-# define qemu_st_lew(X)  stw_le_p(g2h(taddr), X)
-# define qemu_st_lel(X)  stl_le_p(g2h(taddr), X)
-# define qemu_st_leq(X)  stq_le_p(g2h(taddr), X)
-# define qemu_st_bew(X)  stw_be_p(g2h(taddr), X)
-# define qemu_st_bel(X)  stl_be_p(g2h(taddr), X)
-# define qemu_st_beq(X)  stq_be_p(g2h(taddr), X)
-#endif
+#define qemu_ld_ub \
+    cpu_ldub_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_ld_leuw \
+    cpu_lduw_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_ld_leul \
+    cpu_ldl_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_ld_leq \
+    cpu_ldq_le_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_ld_beuw \
+    cpu_lduw_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_ld_beul \
+    cpu_ldl_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_ld_beq \
+    cpu_ldq_be_mmuidx_ra(env, taddr, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_st_b(X) \
+    cpu_stb_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_st_lew(X) \
+    cpu_stw_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_st_lel(X) \
+    cpu_stl_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_st_leq(X) \
+    cpu_stq_le_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_st_bew(X) \
+    cpu_stw_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_st_bel(X) \
+    cpu_stl_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
+#define qemu_st_beq(X) \
+    cpu_stq_be_mmuidx_ra(env, taddr, X, get_mmuidx(oi), (uintptr_t)tb_ptr)
 
 #if TCG_TARGET_REG_BITS == 64
 # define CASE_32_64(x) \
-- 
2.25.1

Allow other places in tcg to restart with a smaller tb.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void set_jmp_reset_offset(TCGContext *s, int which)
     s->tb_jmp_reset_offset[which] = tcg_current_code_size(s);
 }
 
+/* Signal overflow, starting over with fewer guest insns. */
+static void QEMU_NORETURN tcg_raise_tb_overflow(TCGContext *s)
+{
+    siglongjmp(s->jmp_trans, -2);
+}
+
 #define C_PFX1(P, A)                    P##A
 #define C_PFX2(P, A, B)                 P##A##_##B
 #define C_PFX3(P, A, B, C)              P##A##_##B##_##C
@@ -XXX,XX +XXX,XX @@ static TCGTemp *tcg_temp_alloc(TCGContext *s)
     int n = s->nb_temps++;
 
     if (n >= TCG_MAX_TEMPS) {
-        /* Signal overflow, starting over with fewer guest insns. */
-        siglongjmp(s->jmp_trans, -2);
+        tcg_raise_tb_overflow(s);
     }
     return memset(&s->temps[n], 0, sizeof(TCGTemp));
 }
-- 
2.25.1

The use in tcg_tb_lookup is given an arbitrary pc that comes from a
signal handler.  Do not assert that the pointer is already within
the code gen buffer at all, much less the writable mirror of it.

Fixes: db0c51a3803
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -XXX,XX +XXX,XX @@ static void tcg_region_trees_init(void)
     }
 }
 
-static struct tcg_region_tree *tc_ptr_to_region_tree(const void *cp)
+static struct tcg_region_tree *tc_ptr_to_region_tree(const void *p)
 {
-    void *p = tcg_splitwx_to_rw(cp);
     size_t region_idx;
 
+    /*
+     * Like tcg_splitwx_to_rw, with no assert.  The pc may come from
+     * a signal handler over which the caller has no control.
+     */
+    if (!in_code_gen_buffer(p)) {
+        p -= tcg_splitwx_diff;
+        if (!in_code_gen_buffer(p)) {
+            return NULL;
+        }
+    }
+
     if (p < region.start_aligned) {
         region_idx = 0;
     } else {
@@ -XXX,XX +XXX,XX @@ void tcg_tb_insert(TranslationBlock *tb)
 {
     struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
 
+    g_assert(rt != NULL);
     qemu_mutex_lock(&rt->lock);
     g_tree_insert(rt->tree, &tb->tc, tb);
     qemu_mutex_unlock(&rt->lock);
@@ -XXX,XX +XXX,XX @@ void tcg_tb_remove(TranslationBlock *tb)
 {
     struct tcg_region_tree *rt = tc_ptr_to_region_tree(tb->tc.ptr);
 
+    g_assert(rt != NULL);
     qemu_mutex_lock(&rt->lock);
     g_tree_remove(rt->tree, &tb->tc);
     qemu_mutex_unlock(&rt->lock);
@@ -XXX,XX +XXX,XX @@ TranslationBlock *tcg_tb_lookup(uintptr_t tc_ptr)
     TranslationBlock *tb;
     struct tb_tc s = { .ptr = (void *)tc_ptr };
 
+    if (rt == NULL) {
+        return NULL;
+    }
+
     qemu_mutex_lock(&rt->lock);
     tb = g_tree_lookup(rt->tree, &s);
     qemu_mutex_unlock(&rt->lock);
-- 
2.25.1