Series comparison

-[PULL 00/24] tcg patch queue
+[PULL 00/56] tcg patch queue
-The following changes since commit 6587b0c1331d427b0939c37e763842550ed581db:
+The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:
-  Merge remote-tracking branch 'remotes/ericb/tags/pull-nbd-2021-10-15' into staging (2021-10-15 14:16:28 -0700)
+  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)
 are available in the Git repository at:
-  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211016
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027
-for you to fetch changes up to 995b87dedc78b0467f5f18bbc3546072ba97516a:
+for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:
-  Revert "cpu: Move cpu_common_props to hw/core/cpu.c" (2021-10-15 16:39:15 -0700)
+  tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)
 ----------------------------------------------------------------
-Move gdb singlestep to generic code
+Improvements to qemu/int128
-Fix cpu_common_props
+Fixes for 128/64 division.
 Cleanup tcg/optimize.c
 Optimize redundant sign extensions
 ----------------------------------------------------------------
-Richard Henderson (24):
+Frédéric Pétrot (1):
-      accel/tcg: Handle gdb singlestep in cpu_tb_exec
+      qemu/int128: Add int128_{not,xor}
       target/alpha: Drop checks for singlestep_enabled
       target/avr: Drop checks for singlestep_enabled
       target/cris: Drop checks for singlestep_enabled
       target/hexagon: Drop checks for singlestep_enabled
       target/arm: Drop checks for singlestep_enabled
       target/hppa: Drop checks for singlestep_enabled
       target/i386: Check CF_NO_GOTO_TB for dc->jmp_opt
       target/i386: Drop check for singlestep_enabled
       target/m68k: Drop checks for singlestep_enabled
       target/microblaze: Check CF_NO_GOTO_TB for DISAS_JUMP
       target/microblaze: Drop checks for singlestep_enabled
       target/mips: Fix single stepping
       target/mips: Drop exit checks for singlestep_enabled
       target/openrisc: Drop checks for singlestep_enabled
       target/ppc: Drop exit checks for singlestep_enabled
       target/riscv: Remove dead code after exception
       target/riscv: Remove exit_tb and lookup_and_goto_ptr
       target/rx: Drop checks for singlestep_enabled
       target/s390x: Drop check for singlestep_enabled
       target/sh4: Drop check for singlestep_enabled
       target/tricore: Drop check for singlestep_enabled
       target/xtensa: Drop check for singlestep_enabled
       Revert "cpu: Move cpu_common_props to hw/core/cpu.c"
- include/hw/core/cpu.h                          |  1 +
+Luis Pires (4):
- target/i386/helper.h                           |  1 -
+      host-utils: move checks out of divu128/divs128
- target/rx/helper.h                             |  1 -
+      host-utils: move udiv_qrnnd() to host-utils
- target/sh4/helper.h                            |  1 -
+      host-utils: add 128-bit quotient support to divu128/divs128
- target/tricore/helper.h                        |  1 -
+      host-utils: add unit tests for divu128/divs128
  accel/tcg/cpu-exec.c                           | 11 ++++
  cpu.c                                          | 21 ++++++++
  hw/core/cpu-common.c                           | 17 +-----
  target/alpha/translate.c                       | 13 ++---
  target/arm/translate-a64.c                     | 10 +---
  target/arm/translate.c                         | 36 +++----------
  target/avr/translate.c                         | 19 ++-----
  target/cris/translate.c                        | 16 ------
  target/hexagon/translate.c                     | 12 +----
  target/hppa/translate.c                        | 17 ++----
  target/i386/tcg/misc_helper.c                  |  8 ---
  target/i386/tcg/translate.c                    |  9 ++--
  target/m68k/translate.c                        | 44 ++++-----------
  target/microblaze/translate.c                  | 18 ++-----
  target/mips/tcg/translate.c                    | 75 ++++++++++++--------------
  target/openrisc/translate.c                    | 18 ++-----
  target/ppc/translate.c                         | 38 +++----------
  target/riscv/translate.c                       | 27 +---------
  target/rx/op_helper.c                          |  8 ---
  target/rx/translate.c                          | 12 +----
  target/s390x/tcg/translate.c                   |  8 +--
  target/sh4/op_helper.c                         |  5 --
  target/sh4/translate.c                         | 14 ++---
  target/tricore/op_helper.c                     |  7 ---
  target/tricore/translate.c                     | 14 +----
  target/xtensa/translate.c                      | 25 +++------
  target/riscv/insn_trans/trans_privileged.c.inc | 10 ++--
  target/riscv/insn_trans/trans_rvi.c.inc        |  8 ++-
  target/riscv/insn_trans/trans_rvv.c.inc        |  2 +-
 files changed, 141 insertions(+), 386 deletions(-)
+Richard Henderson (51):
+      tcg/optimize: Rename "mask" to "z_mask"
+      tcg/optimize: Split out OptContext
+      tcg/optimize: Remove do_default label
+      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
+      tcg/optimize: Move prev_mb into OptContext
+      tcg/optimize: Split out init_arguments
+      tcg/optimize: Split out copy_propagate
+      tcg/optimize: Split out fold_call
+      tcg/optimize: Drop nb_oargs, nb_iargs locals
+      tcg/optimize: Change fail return for do_constant_folding_cond*
+      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
+      tcg/optimize: Split out finish_folding
+      tcg/optimize: Use a boolean to avoid a mass of continues
+      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
+      tcg/optimize: Split out fold_const{1,2}
+      tcg/optimize: Split out fold_setcond2
+      tcg/optimize: Split out fold_brcond2
+      tcg/optimize: Split out fold_brcond
+      tcg/optimize: Split out fold_setcond
+      tcg/optimize: Split out fold_mulu2_i32
+      tcg/optimize: Split out fold_addsub2_i32
+      tcg/optimize: Split out fold_movcond
+      tcg/optimize: Split out fold_extract2
+      tcg/optimize: Split out fold_extract, fold_sextract
+      tcg/optimize: Split out fold_deposit
+      tcg/optimize: Split out fold_count_zeros
+      tcg/optimize: Split out fold_bswap
+      tcg/optimize: Split out fold_dup, fold_dup2
+      tcg/optimize: Split out fold_mov
+      tcg/optimize: Split out fold_xx_to_i
+      tcg/optimize: Split out fold_xx_to_x
+      tcg/optimize: Split out fold_xi_to_i
+      tcg/optimize: Add type to OptContext
+      tcg/optimize: Split out fold_to_not
+      tcg/optimize: Split out fold_sub_to_neg
+      tcg/optimize: Split out fold_xi_to_x
+      tcg/optimize: Split out fold_ix_to_i
+      tcg/optimize: Split out fold_masks
+      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
+      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
+      tcg/optimize: Sink commutative operand swapping into fold functions
+      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
+      tcg/optimize: Use fold_xx_to_i for orc
+      tcg/optimize: Use fold_xi_to_x for mul
+      tcg/optimize: Use fold_xi_to_x for div
+      tcg/optimize: Use fold_xx_to_i for rem
+      tcg/optimize: Optimize sign extensions
+      tcg/optimize: Propagate sign info for logical operations
+      tcg/optimize: Propagate sign info for setcond
+      tcg/optimize: Propagate sign info for bit counting
+      tcg/optimize: Propagate sign info for shifting
+ include/fpu/softfloat-macros.h |   82 --
+ include/hw/clock.h             |    5 +-
+ include/qemu/host-utils.h      |  121 +-
+ include/qemu/int128.h          |   20 +
+ target/ppc/int_helper.c        |   23 +-
+ tcg/optimize.c                 | 2644 ++++++++++++++++++++++++----------------
+ tests/unit/test-div128.c       |  197 +++
+ util/host-utils.c              |  147 ++-
+ tests/unit/meson.build         |    1 +
+files changed, 2053 insertions(+), 1187 deletions(-)
+ create mode 100644 tests/unit/test-div128.c

-New patch
+[PULL 01/56] qemu/int128: Add int128_{not,xor}
+From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
+Addition of not and xor on 128-bit integers.
+Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
+Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
+Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
+[rth: Split out logical operations.]
+Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ include/qemu/int128.h | 20 ++++++++++++++++++++
+file changed, 20 insertions(+)
+diff --git a/include/qemu/int128.h b/include/qemu/int128.h
+index XXXXXXX..XXXXXXX 100644
+--- a/include/qemu/int128.h
++++ b/include/qemu/int128.h
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
+     return a;
+ }
++static inline Int128 int128_not(Int128 a)
++{
++    return ~a;
++}
++
+ static inline Int128 int128_and(Int128 a, Int128 b)
+ {
+     return a & b;
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
+     return a | b;
+ }
++static inline Int128 int128_xor(Int128 a, Int128 b)
++{
++    return a ^ b;
++}
++
+ static inline Int128 int128_rshift(Int128 a, int n)
+ {
+     return a >> n;
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
+     return int128_make128(a, (a < 0) ? -1 : 0);
+ }
++static inline Int128 int128_not(Int128 a)
++{
++    return int128_make128(~a.lo, ~a.hi);
++}
++
+ static inline Int128 int128_and(Int128 a, Int128 b)
+ {
+     return int128_make128(a.lo & b.lo, a.hi & b.hi);
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
+     return int128_make128(a.lo | b.lo, a.hi | b.hi);
+ }
++static inline Int128 int128_xor(Int128 a, Int128 b)
++{
++    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
++}
++
+ static inline Int128 int128_rshift(Int128 a, int n)
+ {
+     int64_t h;
+--
+.25.1

-New patch
+[PULL 02/56] host-utils: move checks out of divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 In preparation for changing the divu128/divs128 implementations
 to allow for quotients larger than 64 bits, move the div-by-zero
 and overflow checks to the callers.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |  5 +++--
  include/qemu/host-utils.h | 34 ++++++++++++---------------------
  target/ppc/int_helper.c   | 14 +++++++++-----
  util/host-utils.c         | 40 ++++++++++++++++++---------------------
 files changed, 42 insertions(+), 51 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
          return 0;
      }
      /*
 -     * Ignore divu128() return value as we've caught div-by-zero and don't
 -     * need different behaviour for overflow.
 +     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 +     * divu128 does not return a valid truncated quotient, so the result will
 +     * be wrong.
       */
      divu128(&lo, &hi, clk->period);
      return lo;
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 -        __uint128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result > UINT64_MAX;
 -    }
 +    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 +    __uint128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
 -static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 -        __int128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result != *plow;
 -    }
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
      uint64_t rt = 0;
      int overflow = 0;
 -    overflow = divu128(&rt, &ra, rb);
 -
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || ra >= rb)) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divu128(&rt, &ra, rb);
      }
      if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
      int64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
 -    int overflow = divs128(&rt, &ra, rb);
 +    int overflow = 0;
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divs128(&rt, &ra, rb);
      }
      if (oe) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
      *phigh = rh;
  }
 -/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
 -/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
 -/* remainder via phigh. */
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +/*
 + * Unsigned 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
      unsigned i;
      uint64_t carry = 0;
 -    if (divisor == 0) {
 -        return 1;
 -    } else if (dhi == 0) {
 +    if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
          *phigh = dlo % divisor;
 -        return 0;
 -    } else if (dhi >= divisor) {
 -        return 1;
      } else {
          for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
          *plow = dlo;
          *phigh = dhi;
 -        return 0;
      }
  }
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +/*
 + * Signed 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
      int sgn_dvdnd = *phigh < 0;
      int sgn_divsr = divisor < 0;
 -    int overflow = 0;
      if (sgn_dvdnd) {
          *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
          divisor = 0 - divisor;
      }
 -    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 +    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
      if (sgn_dvdnd  ^ sgn_divsr) {
          *plow = 0 - *plow;
      }
 -
 -    if (!overflow) {
 -        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
 -            overflow = 1;
 -        }
 -    }
 -
 -    return overflow;
  }
  #endif
 --
 .25.1

-New patch
+[PULL 03/56] host-utils: move udiv_qrnnd() to host-utils
+From: Luis Pires <luis.pires@eldorado.org.br>
 Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
 so it can be reused by divu128().
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/fpu/softfloat-macros.h | 82 ----------------------------------
  include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 files changed, 81 insertions(+), 82 deletions(-)
 diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/fpu/softfloat-macros.h
 +++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
   * so some portions are provided under:
   *  the SoftFloat-2a license
   *  the BSD license
 - *  GPL-v2-or-later
   *
   * Any future contributions to this file after December 1st 2014 will be
   * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
   * THE POSSIBILITY OF SUCH DAMAGE.
   */
 -/* Portions of this work are licensed under the terms of the GNU GPL,
 - * version 2 or later. See the COPYING file in the top-level directory.
 - */
 -
  #ifndef FPU_SOFTFLOAT_MACROS_H
  #define FPU_SOFTFLOAT_MACROS_H
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
  }
 -/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 - * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 - *
 - * Licensed under the GPLv2/LGPLv3
 - */
 -static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 -                                  uint64_t n0, uint64_t d)
 -{
 -#if defined(__x86_64__)
 -    uint64_t q;
 -    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 -    return q;
 -#elif defined(__s390x__) && !defined(__clang__)
 -    /* Need to use a TImode type to get an even register pair for DLGR.  */
 -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 -    *r = n >> 64;
 -    return n;
 -#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 -    /* From Power ISA 2.06, programming note for divdeu.  */
 -    uint64_t q1, q2, Q, r1, r2, R;
 -    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 -        : "=&r"(q1), "=r"(q2)
 -        : "r"(n1), "r"(n0), "r"(d));
 -    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 -    r2 = n0 - (q2 * d);
 -    Q = q1 + q2;
 -    R = r1 + r2;
 -    if (R >= d || R < r2) { /* overflow implies R > d */
 -        Q += 1;
 -        R -= d;
 -    }
 -    *r = R;
 -    return Q;
 -#else
 -    uint64_t d0, d1, q0, q1, r1, r0, m;
 -
 -    d0 = (uint32_t)d;
 -    d1 = d >> 32;
 -
 -    r1 = n1 % d1;
 -    q1 = n1 / d1;
 -    m = q1 * d0;
 -    r1 = (r1 << 32) | (n0 >> 32);
 -    if (r1 < m) {
 -        q1 -= 1;
 -        r1 += d;
 -        if (r1 >= d) {
 -            if (r1 < m) {
 -                q1 -= 1;
 -                r1 += d;
 -            }
 -        }
 -    }
 -    r1 -= m;
 -
 -    r0 = r1 % d1;
 -    q0 = r1 / d1;
 -    m = q0 * d0;
 -    r0 = (r0 << 32) | (uint32_t)n0;
 -    if (r0 < m) {
 -        q0 -= 1;
 -        r0 += d;
 -        if (r0 >= d) {
 -            if (r0 < m) {
 -                q0 -= 1;
 -                r0 += d;
 -            }
 -        }
 -    }
 -    r0 -= m;
 -
 -    *r = r0;
 -    return (q1 << 32) | q0;
 -#endif
 -}
 -
  /*----------------------------------------------------------------------------
  | Returns an approximation to the square root of the 32-bit significand given
  | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
   * THE SOFTWARE.
   */
 +/* Portions of this work are licensed under the terms of the GNU GPL,
 + * version 2 or later. See the COPYING file in the top-level directory.
 + */
 +
  #ifndef HOST_UTILS_H
  #define HOST_UTILS_H
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
   */
  void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 +/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 + * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 + *
 + * Licensed under the GPLv2/LGPLv3
 + */
 +static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 +                                  uint64_t n0, uint64_t d)
 +{
 +#if defined(__x86_64__)
 +    uint64_t q;
 +    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 +    return q;
 +#elif defined(__s390x__) && !defined(__clang__)
 +    /* Need to use a TImode type to get an even register pair for DLGR.  */
 +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 +    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 +    *r = n >> 64;
 +    return n;
 +#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 +    /* From Power ISA 2.06, programming note for divdeu.  */
 +    uint64_t q1, q2, Q, r1, r2, R;
 +    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 +        : "=&r"(q1), "=r"(q2)
 +        : "r"(n1), "r"(n0), "r"(d));
 +    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 +    r2 = n0 - (q2 * d);
 +    Q = q1 + q2;
 +    R = r1 + r2;
 +    if (R >= d || R < r2) { /* overflow implies R > d */
 +        Q += 1;
 +        R -= d;
 +    }
 +    *r = R;
 +    return Q;
 +#else
 +    uint64_t d0, d1, q0, q1, r1, r0, m;
 +
 +    d0 = (uint32_t)d;
 +    d1 = d >> 32;
 +
 +    r1 = n1 % d1;
 +    q1 = n1 / d1;
 +    m = q1 * d0;
 +    r1 = (r1 << 32) | (n0 >> 32);
 +    if (r1 < m) {
 +        q1 -= 1;
 +        r1 += d;
 +        if (r1 >= d) {
 +            if (r1 < m) {
 +                q1 -= 1;
 +                r1 += d;
 +            }
 +        }
 +    }
 +    r1 -= m;
 +
 +    r0 = r1 % d1;
 +    q0 = r1 / d1;
 +    m = q0 * d0;
 +    r0 = (r0 << 32) | (uint32_t)n0;
 +    if (r0 < m) {
 +        q0 -= 1;
 +        r0 += d;
 +        if (r0 >= d) {
 +            if (r0 < m) {
 +                q0 -= 1;
 +                r0 += d;
 +            }
 +        }
 +    }
 +    r0 -= m;
 +
 +    *r = r0;
 +    return (q1 << 32) | q0;
 +#endif
 +}
 +
  #endif
 --
 .25.1

-New patch
+[PULL 04/56] host-utils: add 128-bit quotient support to divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 These will be used to implement new decimal floating point
 instructions from Power ISA 3.1.
 The remainder is now returned directly by divu128/divs128,
 freeing up phigh to receive the high 64 bits of the quotient.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |   6 +-
  include/qemu/host-utils.h |  20 ++++--
  target/ppc/int_helper.c   |   9 +--
  util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 files changed, 108 insertions(+), 60 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
      if (clk->period == 0) {
          return 0;
      }
 -    /*
 -     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 -     * divu128 does not return a valid truncated quotient, so the result will
 -     * be wrong.
 -     */
 +
      divu128(&lo, &hi, clk->period);
      return lo;
  }
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
 +                               uint64_t divisor)
  {
      __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
      __uint128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
 -static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
 +                              int64_t divisor)
  {
 -    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
      __int128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
  uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
  {
 -    int64_t rt = 0;
 +    uint64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
      int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
      int cr;
      uint64_t lo_value;
      uint64_t hi_value;
 +    uint64_t rem;
      ppc_avr_t ret = { .u64 = { 0, 0 } };
      if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
           * In that case, we leave r unchanged.
           */
      } else {
 -        divu128(&lo_value, &hi_value, 1000000000000000ULL);
 +        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 -        for (i = 1; i < 16; hi_value /= 10, i++) {
 -            bcd_put_digit(&ret, hi_value % 10, i);
 +        for (i = 1; i < 16; rem /= 10, i++) {
 +            bcd_put_digit(&ret, rem % 10, i);
          }
          for (; i < 32; lo_value /= 10, i++) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
  }
  /*
 - * Unsigned 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Unsigned 128-by-64 division.
 + * Returns the remainder.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
 -    unsigned i;
 -    uint64_t carry = 0;
 +    uint64_t rem, dhighest;
 +    int sh;
      if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
 -        *phigh = dlo % divisor;
 +        *phigh = 0;
 +        return dlo % divisor;
      } else {
 +        sh = clz64(divisor);
 -        for (i = 0; i < 64; i++) {
 -            carry = dhi >> 63;
 -            dhi = (dhi << 1) | (dlo >> 63);
 -            if (carry || (dhi >= divisor)) {
 -                dhi -= divisor;
 -                carry = 1;
 -            } else {
 -                carry = 0;
 +        if (dhi < divisor) {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
              }
 -            dlo = (dlo << 1) | carry;
 +
 +            *phigh = 0;
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
 +        } else {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhighest = dhi >> (64 - sh);
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
 +
 +                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
 +            } else {
 +                /**
 +                 * dhi >= divisor
 +                 * Since the MSB of divisor is set (sh == 0),
 +                 * (dhi - divisor) < divisor
 +                 *
 +                 * Thus, the high part of the quotient is 1, and we can
 +                 * calculate the low part with a single call to udiv_qrnnd
 +                 * after subtracting divisor from dhi
 +                 */
 +                dhi -= divisor;
 +                *phigh = 1;
 +            }
 +
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
          }
 -        *plow = dlo;
 -        *phigh = dhi;
 +        /*
 +         * since the dividend/divisor might have been normalized,
 +         * the remainder might also have to be shifted back
 +         */
 +        return rem >> sh;
      }
  }
  /*
 - * Signed 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Signed 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    int sgn_dvdnd = *phigh < 0;
 -    int sgn_divsr = divisor < 0;
 +    bool neg_quotient = false, neg_remainder = false;
 +    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
 +    uint64_t rem;
 -    if (sgn_dvdnd) {
 -        *plow = ~(*plow);
 -        *phigh = ~(*phigh);
 -        if (*plow == (int64_t)-1) {
 +    if (*phigh < 0) {
 +        neg_quotient = !neg_quotient;
 +        neg_remainder = !neg_remainder;
 +
 +        if (unsig_lo == 0) {
 +            unsig_hi = -unsig_hi;
 +        } else {
 +            unsig_hi = ~unsig_hi;
 +            unsig_lo = -unsig_lo;
 +        }
 +    }
 +
 +    if (divisor < 0) {
 +        neg_quotient = !neg_quotient;
 +
 +        divisor = -divisor;
 +    }
 +
 +    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
 +
 +    if (neg_quotient) {
 +        if (unsig_lo == 0) {
 +            *phigh = -unsig_hi;
              *plow = 0;
 -            (*phigh)++;
 -         } else {
 -            (*plow)++;
 -         }
 +        } else {
 +            *phigh = ~unsig_hi;
 +            *plow = -unsig_lo;
 +        }
 +    } else {
 +        *phigh = unsig_hi;
 +        *plow = unsig_lo;
      }
 -    if (sgn_divsr) {
 -        divisor = 0 - divisor;
 -    }
 -
 -    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 -
 -    if (sgn_dvdnd  ^ sgn_divsr) {
 -        *plow = 0 - *plow;
 +    if (neg_remainder) {
 +        return -rem;
 +    } else {
 +        return rem;
      }
  }
  #endif
 --
 .25.1

-New patch
+[PULL 05/56] host-utils: add unit tests for divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
  tests/unit/meson.build   |   1 +
 files changed, 198 insertions(+)
  create mode 100644 tests/unit/test-div128.c
 diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Test 128-bit division functions
 + *
 + * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu/host-utils.h"
 +
 +typedef struct {
 +    uint64_t high;
 +    uint64_t low;
 +    uint64_t rhigh;
 +    uint64_t rlow;
 +    uint64_t divisor;
 +    uint64_t remainder;
 +} test_data_unsigned;
 +
 +typedef struct {
 +    int64_t high;
 +    uint64_t low;
 +    int64_t rhigh;
 +    uint64_t rlow;
 +    int64_t divisor;
 +    int64_t remainder;
 +} test_data_signed;
 +
 +static const test_data_unsigned test_table_unsigned[] = {
 +    /* Dividend fits in 64 bits */
 +    { 0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000003ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000002ULL, 0x0000000000000001ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0xa000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000002ULL,
 +      0x4000000000000000ULL, 0x2000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x8000000000000000ULL, 0x0000000000000000ULL},
 +
 +    /* Dividend > 64 bits, with MSB 0 */
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x000000000000000dULL,
 +      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
 +      0x0000000000000010ULL, 0x0000000000000001ULL},
 +
 +    /* Dividend > 64 bits, with MSB 1 */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
 +      0x0000000000000010ULL, 0x000000000000000fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
 +      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
 +
 +    /**
 +     * Divisor == 64 bits, with MSB 1
 +     * and high 64 bits of dividend >= divisor
 +     * (for testing normalization)
 +     */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0xfddbb9977553310aULL,
 +      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
 +
 +    /* Dividend > 64 bits, divisor almost as big */
 +    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
 +      0x0000000000000000ULL, 0x000000000000000fULL,
 +      0x123456789abcdefeULL, 0x123456789abcde1fULL},
 +};
 +
 +static const test_data_signed test_table_signed[] = {
 +    /* Positive dividend, positive/negative divisors */
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0x0000000000000008LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0xfffffffffffffff8LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0x0000000000000237LL, 0x0000000000000183LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0xfffffffffffffdc9LL, 0x0000000000000183LL},
 +
 +    /* Negative dividend, positive/negative divisors */
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0x0000000000000008LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0x0000000000000237LL, 0xfffffffffffffe7dLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
 +};
 +
 +static void test_divu128(void)
 +{
 +    int i;
 +    uint64_t rem;
 +    test_data_unsigned tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
 +        tmp = test_table_unsigned[i];
 +
 +        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +static void test_divs128(void)
 +{
 +    int i;
 +    int64_t rem;
 +    test_data_signed tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
 +        tmp = test_table_signed[i];
 +
 +        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +int main(int argc, char **argv)
 +{
 +    g_test_init(&argc, &argv, NULL);
 +    g_test_add_func("/host-utils/test_divu128", test_divu128);
 +    g_test_add_func("/host-utils/test_divs128", test_divs128);
 +    return g_test_run();
 +}
 diff --git a/tests/unit/meson.build b/tests/unit/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/unit/meson.build
 +++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
    # all code tested by test-x86-cpuid is inside topology.h
    'test-x86-cpuid': [],
    'test-cutils': [],
 +  'test-div128': [],
    'test-shift128': [],
    'test-mul64': [],
    # all code tested by test-int128 is inside int128.h
 --
 .25.1

-New patch
+[PULL 06/56] tcg/optimize: Rename "mask" to "z_mask"
+Prepare for tracking different masks by renaming this one.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
+file changed, 72 insertions(+), 70 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+     TCGTemp *prev_copy;
+     TCGTemp *next_copy;
+     uint64_t val;
+-    uint64_t mask;
++    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+ } TempOptInfo;
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
+     ti->next_copy = ts;
+     ti->prev_copy = ts;
+     ti->is_const = false;
+-    ti->mask = -1;
++    ti->z_mask = -1;
+ }
+ static void reset_temp(TCGArg arg)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+     if (ts->kind == TEMP_CONST) {
+         ti->is_const = true;
+         ti->val = ts->val;
+-        ti->mask = ts->val;
++        ti->z_mask = ts->val;
+         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
+             /* High bits of a 32-bit quantity are garbage.  */
+-            ti->mask |= ~0xffffffffull;
++            ti->z_mask |= ~0xffffffffull;
+         }
+     } else {
+         ti->is_const = false;
+-        ti->mask = -1;
++        ti->z_mask = -1;
+     }
+ }
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+     const TCGOpDef *def;
+     TempOptInfo *di;
+     TempOptInfo *si;
+-    uint64_t mask;
++    uint64_t z_mask;
+     TCGOpcode new_op;
+     if (ts_are_copies(dst_ts, src_ts)) {
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+     op->args[0] = dst;
+     op->args[1] = src;
+-    mask = si->mask;
++    z_mask = si->z_mask;
+     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+         /* High bits of the destination are now garbage.  */
+-        mask |= ~0xffffffffull;
++        z_mask |= ~0xffffffffull;
+     }
+-    di->mask = mask;
++    di->z_mask = z_mask;
+     if (src_ts->type == dst_ts->type) {
+         TempOptInfo *ni = ts_info(si->next_copy);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+     }
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
+-        uint64_t mask, partmask, affected, tmp;
++        uint64_t z_mask, partmask, affected, tmp;
+         int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def = &tcg_op_defs[opc];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         /* Simplify using known-zero bits. Currently only ops with a single
+            output argument is supported. */
+-        mask = -1;
++        z_mask = -1;
+         affected = -1;
+         switch (opc) {
+         CASE_OP_32_64(ext8s):
+-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
++            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
+                 break;
+             }
+             QEMU_FALLTHROUGH;
+         CASE_OP_32_64(ext8u):
+-            mask = 0xff;
++            z_mask = 0xff;
+             goto and_const;
+         CASE_OP_32_64(ext16s):
+-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
++            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
+                 break;
+             }
+             QEMU_FALLTHROUGH;
+         CASE_OP_32_64(ext16u):
+-            mask = 0xffff;
++            z_mask = 0xffff;
+             goto and_const;
+         case INDEX_op_ext32s_i64:
+-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
++            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
+                 break;
+             }
+             QEMU_FALLTHROUGH;
+         case INDEX_op_ext32u_i64:
+-            mask = 0xffffffffU;
++            z_mask = 0xffffffffU;
+             goto and_const;
+         CASE_OP_32_64(and):
+-            mask = arg_info(op->args[2])->mask;
++            z_mask = arg_info(op->args[2])->z_mask;
+             if (arg_is_const(op->args[2])) {
+         and_const:
+-                affected = arg_info(op->args[1])->mask & ~mask;
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
+             }
+-            mask = arg_info(op->args[1])->mask & mask;
++            z_mask = arg_info(op->args[1])->z_mask & z_mask;
+             break;
+         case INDEX_op_ext_i32_i64:
+-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
++            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
+                 break;
+             }
+             QEMU_FALLTHROUGH;
+         case INDEX_op_extu_i32_i64:
+             /* We do not compute affected as it is a size changing op.  */
+-            mask = (uint32_t)arg_info(op->args[1])->mask;
++            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
+             break;
+         CASE_OP_32_64(andc):
+             /* Known-zeros does not imply known-ones.  Therefore unless
+                op->args[2] is constant, we can't infer anything from it.  */
+             if (arg_is_const(op->args[2])) {
+-                mask = ~arg_info(op->args[2])->mask;
++                z_mask = ~arg_info(op->args[2])->z_mask;
+                 goto and_const;
+             }
+             /* But we certainly know nothing outside args[1] may be set. */
+-            mask = arg_info(op->args[1])->mask;
++            z_mask = arg_info(op->args[1])->z_mask;
+             break;
+         case INDEX_op_sar_i32:
+             if (arg_is_const(op->args[2])) {
+                 tmp = arg_info(op->args[2])->val & 31;
+-                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
++                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
+             }
+             break;
+         case INDEX_op_sar_i64:
+             if (arg_is_const(op->args[2])) {
+                 tmp = arg_info(op->args[2])->val & 63;
+-                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
++                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
+             }
+             break;
+         case INDEX_op_shr_i32:
+             if (arg_is_const(op->args[2])) {
+                 tmp = arg_info(op->args[2])->val & 31;
+-                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
++                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
+             }
+             break;
+         case INDEX_op_shr_i64:
+             if (arg_is_const(op->args[2])) {
+                 tmp = arg_info(op->args[2])->val & 63;
+-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
++                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
+             }
+             break;
+         case INDEX_op_extrl_i64_i32:
+-            mask = (uint32_t)arg_info(op->args[1])->mask;
++            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
+             break;
+         case INDEX_op_extrh_i64_i32:
+-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
++            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
+             break;
+         CASE_OP_32_64(shl):
+             if (arg_is_const(op->args[2])) {
+                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
+-                mask = arg_info(op->args[1])->mask << tmp;
++                z_mask = arg_info(op->args[1])->z_mask << tmp;
+             }
+             break;
+         CASE_OP_32_64(neg):
+             /* Set to 1 all bits to the left of the rightmost.  */
+-            mask = -(arg_info(op->args[1])->mask
+-                     & -arg_info(op->args[1])->mask);
++            z_mask = -(arg_info(op->args[1])->z_mask
++                       & -arg_info(op->args[1])->z_mask);
+             break;
+         CASE_OP_32_64(deposit):
+-            mask = deposit64(arg_info(op->args[1])->mask,
+-                             op->args[3], op->args[4],
+-                             arg_info(op->args[2])->mask);
++            z_mask = deposit64(arg_info(op->args[1])->z_mask,
++                               op->args[3], op->args[4],
++                               arg_info(op->args[2])->z_mask);
+             break;
+         CASE_OP_32_64(extract):
+-            mask = extract64(arg_info(op->args[1])->mask,
+-                             op->args[2], op->args[3]);
++            z_mask = extract64(arg_info(op->args[1])->z_mask,
++                               op->args[2], op->args[3]);
+             if (op->args[2] == 0) {
+-                affected = arg_info(op->args[1])->mask & ~mask;
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
+             }
+             break;
+         CASE_OP_32_64(sextract):
+-            mask = sextract64(arg_info(op->args[1])->mask,
+-                              op->args[2], op->args[3]);
+-            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
+-                affected = arg_info(op->args[1])->mask & ~mask;
++            z_mask = sextract64(arg_info(op->args[1])->z_mask,
++                                op->args[2], op->args[3]);
++            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
++                affected = arg_info(op->args[1])->z_mask & ~z_mask;
+             }
+             break;
+         CASE_OP_32_64(or):
+         CASE_OP_32_64(xor):
+-            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
++            z_mask = arg_info(op->args[1])->z_mask
++                   | arg_info(op->args[2])->z_mask;
+             break;
+         case INDEX_op_clz_i32:
+         case INDEX_op_ctz_i32:
+-            mask = arg_info(op->args[2])->mask | 31;
++            z_mask = arg_info(op->args[2])->z_mask | 31;
+             break;
+         case INDEX_op_clz_i64:
+         case INDEX_op_ctz_i64:
+-            mask = arg_info(op->args[2])->mask | 63;
++            z_mask = arg_info(op->args[2])->z_mask | 63;
+             break;
+         case INDEX_op_ctpop_i32:
+-            mask = 32 | 31;
++            z_mask = 32 | 31;
+             break;
+         case INDEX_op_ctpop_i64:
+-            mask = 64 | 63;
++            z_mask = 64 | 63;
+             break;
+         CASE_OP_32_64(setcond):
+         case INDEX_op_setcond2_i32:
+-            mask = 1;
++            z_mask = 1;
+             break;
+         CASE_OP_32_64(movcond):
+-            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
++            z_mask = arg_info(op->args[3])->z_mask
++                   | arg_info(op->args[4])->z_mask;
+             break;
+         CASE_OP_32_64(ld8u):
+-            mask = 0xff;
++            z_mask = 0xff;
+             break;
+         CASE_OP_32_64(ld16u):
+-            mask = 0xffff;
++            z_mask = 0xffff;
+             break;
+         case INDEX_op_ld32u_i64:
+-            mask = 0xffffffffu;
++            z_mask = 0xffffffffu;
+             break;
+         CASE_OP_32_64(qemu_ld):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 MemOpIdx oi = op->args[nb_oargs + nb_iargs];
+                 MemOp mop = get_memop(oi);
+                 if (!(mop & MO_SIGN)) {
+-                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
++                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+                 }
+             }
+             break;
+         CASE_OP_32_64(bswap16):
+-            mask = arg_info(op->args[1])->mask;
+-            if (mask <= 0xffff) {
++            z_mask = arg_info(op->args[1])->z_mask;
++            if (z_mask <= 0xffff) {
+                 op->args[2] |= TCG_BSWAP_IZ;
+             }
+-            mask = bswap16(mask);
++            z_mask = bswap16(z_mask);
+             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+             case TCG_BSWAP_OZ:
+                 break;
+             case TCG_BSWAP_OS:
+-                mask = (int16_t)mask;
++                z_mask = (int16_t)z_mask;
+                 break;
+             default: /* undefined high bits */
+-                mask |= MAKE_64BIT_MASK(16, 48);
++                z_mask |= MAKE_64BIT_MASK(16, 48);
+                 break;
+             }
+             break;
+         case INDEX_op_bswap32_i64:
+-            mask = arg_info(op->args[1])->mask;
+-            if (mask <= 0xffffffffu) {
++            z_mask = arg_info(op->args[1])->z_mask;
++            if (z_mask <= 0xffffffffu) {
+                 op->args[2] |= TCG_BSWAP_IZ;
+             }
+-            mask = bswap32(mask);
++            z_mask = bswap32(z_mask);
+             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+             case TCG_BSWAP_OZ:
+                 break;
+             case TCG_BSWAP_OS:
+-                mask = (int32_t)mask;
++                z_mask = (int32_t)z_mask;
+                 break;
+             default: /* undefined high bits */
+-                mask |= MAKE_64BIT_MASK(32, 32);
++                z_mask |= MAKE_64BIT_MASK(32, 32);
+                 break;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         /* 32-bit ops generate 32-bit results.  For the result is zero test
+            below, we can ignore high bits, but for further optimizations we
+            need to record that the high bits contain garbage.  */
+-        partmask = mask;
++        partmask = z_mask;
+         if (!(def->flags & TCG_OPF_64BIT)) {
+-            mask |= ~(tcg_target_ulong)0xffffffffu;
++            z_mask |= ~(tcg_target_ulong)0xffffffffu;
+             partmask &= 0xffffffffu;
+             affected &= 0xffffffffu;
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                    vs the high word of the input.  */
+             do_setcond_high:
+                 reset_temp(op->args[0]);
+-                arg_info(op->args[0])->mask = 1;
++                arg_info(op->args[0])->z_mask = 1;
+                 op->opc = INDEX_op_setcond_i32;
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 }
+             do_setcond_low:
+                 reset_temp(op->args[0]);
+-                arg_info(op->args[0])->mask = 1;
++                arg_info(op->args[0])->z_mask = 1;
+                 op->opc = INDEX_op_setcond_i32;
+                 op->args[2] = op->args[3];
+                 op->args[3] = op->args[5];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             /* Default case: we know nothing about operation (or were unable
+                to compute the operation result) so no propagation is done.
+                We trash everything if the operation is the end of a basic
+-               block, otherwise we only trash the output args.  "mask" is
++               block, otherwise we only trash the output args.  "z_mask" is
+                the non-zero bits mask for the first output arg.  */
+             if (def->flags & TCG_OPF_BB_END) {
+                 memset(&temps_used, 0, sizeof(temps_used));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     /* Save the corresponding known-zero bits mask for the
+                        first output argument (only one supported so far). */
+                     if (i == 0) {
+-                        arg_info(op->args[i])->mask = mask;
++                        arg_info(op->args[i])->z_mask = z_mask;
+                     }
+                 }
+             }
+--
+.25.1

-New patch
+[PULL 07/56] tcg/optimize: Split out OptContext
+Provide what will become a larger context for splitting
+the very large tcg_optimize function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
+file changed, 40 insertions(+), 37 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+ } TempOptInfo;
++typedef struct OptContext {
++    TCGTempSet temps_used;
++} OptContext;
++
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+ {
+     return ts->state_ptr;
+@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
+ }
+ /* Initialize and activate a temporary.  */
+-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
++static void init_ts_info(OptContext *ctx, TCGTemp *ts)
+ {
+     size_t idx = temp_idx(ts);
+     TempOptInfo *ti;
+-    if (test_bit(idx, temps_used->l)) {
++    if (test_bit(idx, ctx->temps_used.l)) {
+         return;
+     }
+-    set_bit(idx, temps_used->l);
++    set_bit(idx, ctx->temps_used.l);
+     ti = ts->state_ptr;
+     if (ti == NULL) {
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+     }
+ }
+-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
++static void init_arg_info(OptContext *ctx, TCGArg arg)
+ {
+-    init_ts_info(temps_used, arg_temp(arg));
++    init_ts_info(ctx, arg_temp(arg));
+ }
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+     }
+ }
+-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
++static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
+                              TCGOp *op, TCGArg dst, uint64_t val)
+ {
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+     /* Convert movi to mov with constant temp. */
+     tv = tcg_constant_internal(type, val);
+-    init_ts_info(temps_used, tv);
++    init_ts_info(ctx, tv);
+     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+ }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+     TCGOp *op, *op_next, *prev_mb = NULL;
+-    TCGTempSet temps_used;
++    OptContext ctx = {};
+     /* Array VALS has an element for each temp.
+        If this temp holds a constant then its value is kept in VALS' element.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+     nb_temps = s->nb_temps;
+     nb_globals = s->nb_globals;
+-    memset(&temps_used, 0, sizeof(temps_used));
+     for (i = 0; i < nb_temps; ++i) {
+         s->temps[i].state_ptr = NULL;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+                 TCGTemp *ts = arg_temp(op->args[i]);
+                 if (ts) {
+-                    init_ts_info(&temps_used, ts);
++                    init_ts_info(&ctx, ts);
+                 }
+             }
+         } else {
+             nb_oargs = def->nb_oargs;
+             nb_iargs = def->nb_iargs;
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+-                init_arg_info(&temps_used, op->args[i]);
++                init_arg_info(&ctx, op->args[i]);
+             }
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(rotr):
+             if (arg_is_const(op->args[1])
+                 && arg_info(op->args[1])->val == 0) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (partmask == 0) {
+             tcg_debug_assert(nb_oargs == 1);
+-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+             continue;
+         }
+         if (affected == 0) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(mulsh):
+             if (arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+         CASE_OP_32_64_VEC(xor):
+             if (args_are_copies(op->args[1], op->args[2])) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = arg_info(op->args[1])->val;
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_dup2_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
+                                            arg_info(op->args[2])->val));
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_extrh_i64_i32:
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           op->args[2]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 TCGArg v = arg_info(op->args[1])->val;
+                 if (v != 0) {
+                     tmp = do_constant_folding(opc, v, 0);
+-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 } else {
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                 }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = deposit64(arg_info(op->args[1])->val,
+                                 op->args[3], op->args[4],
+                                 arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = extract64(arg_info(op->args[1])->val,
+                                 op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = sextract64(arg_info(op->args[1])->val,
+                                  op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
+                                     ((uint32_t)v2 << (32 - shr)));
+                 }
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[3]);
+             if (tmp != 2) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                            op->args[1], op->args[2]);
+             if (tmp != 2) {
+                 if (tmp) {
+-                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                     op->opc = INDEX_op_br;
+                     op->args[0] = op->args[3];
+                 } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rl = op->args[0];
+                 rh = op->args[1];
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rl = op->args[0];
+                 rh = op->args[1];
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (tmp != 2) {
+                 if (tmp) {
+             do_brcond_true:
+-                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                     op->opc = INDEX_op_br;
+                     op->args[0] = op->args[5];
+                 } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_brcond_high:
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = INDEX_op_brcond_i32;
+                 op->args[0] = op->args[1];
+                 op->args[1] = op->args[3];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     goto do_default;
+                 }
+             do_brcond_low:
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = INDEX_op_brcond_i32;
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                             op->args[5]);
+             if (tmp != 2) {
+             do_setcond_const:
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+             } else if ((op->args[5] == TCG_COND_LT
+                         || op->args[5] == TCG_COND_GE)
+                        && arg_is_const(op->args[3])
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (!(tcg_call_flags(op)
+                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+                 for (i = 0; i < nb_globals; i++) {
+-                    if (test_bit(i, temps_used.l)) {
++                    if (test_bit(i, ctx.temps_used.l)) {
+                         reset_ts(&s->temps[i]);
+                     }
+                 }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                block, otherwise we only trash the output args.  "z_mask" is
+                the non-zero bits mask for the first output arg.  */
+             if (def->flags & TCG_OPF_BB_END) {
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+             } else {
+         do_reset_output:
+                 for (i = 0; i < nb_oargs; i++) {
+--
+.25.1

-New patch
+[PULL 08/56] tcg/optimize: Remove do_default label
+Break the final cleanup clause out of the main switch
+statement.  When fully folding an opcode to mov/movi,
+use "continue" to process the next opcode, else break
+to fall into the final cleanup.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
+file changed, 94 insertions(+), 96 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         switch (opc) {
+         CASE_OP_32_64_VEC(mov):
+             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+-            break;
++            continue;
+         case INDEX_op_dup_vec:
+             if (arg_is_const(op->args[1])) {
+                 tmp = arg_info(op->args[1])->val;
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_dup2_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
+                                            arg_info(op->args[2])->val));
+-                break;
++                continue;
+             } else if (args_are_copies(op->args[1], op->args[2])) {
+                 op->opc = INDEX_op_dup_vec;
+                 TCGOP_VECE(op) = MO_32;
+                 nb_iargs = 1;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(not):
+         CASE_OP_32_64(neg):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(bswap16):
+         CASE_OP_32_64(bswap32):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           op->args[2]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(add):
+         CASE_OP_32_64(sub):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           arg_info(op->args[2])->val);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else {
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                 }
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(deposit):
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                 op->args[3], op->args[4],
+                                 arg_info(op->args[2])->val);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(extract):
+             if (arg_is_const(op->args[1])) {
+                 tmp = extract64(arg_info(op->args[1])->val,
+                                 op->args[2], op->args[3]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(sextract):
+             if (arg_is_const(op->args[1])) {
+                 tmp = sextract64(arg_info(op->args[1])->val,
+                                  op->args[2], op->args[3]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(extract2):
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                     ((uint32_t)v2 << (32 - shr)));
+                 }
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(setcond):
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[3]);
+             if (tmp != 2) {
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(brcond):
+             tmp = do_constant_folding_cond(opc, op->args[0],
+                                            op->args[1], op->args[2]);
+-            if (tmp != 2) {
+-                if (tmp) {
+-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                    op->opc = INDEX_op_br;
+-                    op->args[0] = op->args[3];
+-                } else {
+-                    tcg_op_remove(s, op);
+-                }
++            switch (tmp) {
++            case 0:
++                tcg_op_remove(s, op);
++                continue;
++            case 1:
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
++                op->opc = opc = INDEX_op_br;
++                op->args[0] = op->args[3];
+                 break;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(movcond):
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[5]);
+             if (tmp != 2) {
+                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+-                break;
++                continue;
+             }
+             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+                 uint64_t tv = arg_info(op->args[3])->val;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (fv == 1 && tv == 0) {
+                     cond = tcg_invert_cond(cond);
+                 } else if (!(tv == 1 && fv == 0)) {
+-                    goto do_default;
++                    break;
+                 }
+                 op->args[3] = cond;
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                  : INDEX_op_setcond_i64);
+                 nb_iargs = 2;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_add2_i32:
+         case INDEX_op_sub2_i32:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rh = op->args[1];
+                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_mulu2_i32:
+             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rh = op->args[1];
+                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_brcond2_i32:
+             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                             op->args[4]);
+-            if (tmp != 2) {
+-                if (tmp) {
+-            do_brcond_true:
+-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                    op->opc = INDEX_op_br;
+-                    op->args[0] = op->args[5];
+-                } else {
++            if (tmp == 0) {
+             do_brcond_false:
+-                    tcg_op_remove(s, op);
+-                }
+-            } else if ((op->args[4] == TCG_COND_LT
+-                        || op->args[4] == TCG_COND_GE)
+-                       && arg_is_const(op->args[2])
+-                       && arg_info(op->args[2])->val == 0
+-                       && arg_is_const(op->args[3])
+-                       && arg_info(op->args[3])->val == 0) {
++                tcg_op_remove(s, op);
++                continue;
++            }
++            if (tmp == 1) {
++            do_brcond_true:
++                op->opc = opc = INDEX_op_br;
++                op->args[0] = op->args[5];
++                break;
++            }
++            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
++                 && arg_is_const(op->args[2])
++                 && arg_info(op->args[2])->val == 0
++                 && arg_is_const(op->args[3])
++                 && arg_info(op->args[3])->val == 0) {
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_brcond_high:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
++                op->opc = opc = INDEX_op_brcond_i32;
+                 op->args[0] = op->args[1];
+                 op->args[1] = op->args[3];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[4] == TCG_COND_EQ) {
++                break;
++            }
++            if (op->args[4] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (tmp == 0) {
+                     goto do_brcond_false;
+                 } else if (tmp != 1) {
+-                    goto do_default;
++                    break;
+                 }
+             do_brcond_low:
+                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[4] == TCG_COND_NE) {
++                break;
++            }
++            if (op->args[4] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else if (tmp == 1) {
+                     goto do_brcond_true;
+                 }
+-                goto do_default;
+-            } else {
+-                goto do_default;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (tmp != 2) {
+             do_setcond_const:
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-            } else if ((op->args[5] == TCG_COND_LT
+-                        || op->args[5] == TCG_COND_GE)
+-                       && arg_is_const(op->args[3])
+-                       && arg_info(op->args[3])->val == 0
+-                       && arg_is_const(op->args[4])
+-                       && arg_info(op->args[4])->val == 0) {
++                continue;
++            }
++            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
++                 && arg_is_const(op->args[3])
++                 && arg_info(op->args[3])->val == 0
++                 && arg_is_const(op->args[4])
++                 && arg_info(op->args[4])->val == 0) {
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_setcond_high:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[5] == TCG_COND_EQ) {
++                break;
++            }
++            if (op->args[5] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (tmp == 0) {
+                     goto do_setcond_high;
+                 } else if (tmp != 1) {
+-                    goto do_default;
++                    break;
+                 }
+             do_setcond_low:
+                 reset_temp(op->args[0]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->opc = INDEX_op_setcond_i32;
+                 op->args[2] = op->args[3];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[5] == TCG_COND_NE) {
++                break;
++            }
++            if (op->args[5] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else if (tmp == 1) {
+                     goto do_setcond_const;
+                 }
+-                goto do_default;
+-            } else {
+-                goto do_default;
+             }
+             break;
+-        case INDEX_op_call:
+-            if (!(tcg_call_flags(op)
++        default:
++            break;
++        }
++
++        /* Some of the folding above can change opc. */
++        opc = op->opc;
++        def = &tcg_op_defs[opc];
++        if (def->flags & TCG_OPF_BB_END) {
++            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
++        } else {
++            if (opc == INDEX_op_call &&
++                !(tcg_call_flags(op)
+                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+                 for (i = 0; i < nb_globals; i++) {
+                     if (test_bit(i, ctx.temps_used.l)) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     }
+                 }
+             }
+-            goto do_reset_output;
+-        default:
+-        do_default:
+-            /* Default case: we know nothing about operation (or were unable
+-               to compute the operation result) so no propagation is done.
+-               We trash everything if the operation is the end of a basic
+-               block, otherwise we only trash the output args.  "z_mask" is
+-               the non-zero bits mask for the first output arg.  */
+-            if (def->flags & TCG_OPF_BB_END) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-            } else {
+-        do_reset_output:
+-                for (i = 0; i < nb_oargs; i++) {
+-                    reset_temp(op->args[i]);
+-                    /* Save the corresponding known-zero bits mask for the
+-                       first output argument (only one supported so far). */
+-                    if (i == 0) {
+-                        arg_info(op->args[i])->z_mask = z_mask;
+-                    }
++            for (i = 0; i < nb_oargs; i++) {
++                reset_temp(op->args[i]);
++                /* Save the corresponding known-zero bits mask for the
++                   first output argument (only one supported so far). */
++                if (i == 0) {
++                    arg_info(op->args[i])->z_mask = z_mask;
+                 }
+             }
+-            break;
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+--
+.25.1

-New patch
+[PULL 09/56] tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
+Adjust the interface to take the OptContext parameter instead
 of TCGContext or both.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
 file changed, 34 insertions(+), 33 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
  } TempOptInfo;
  typedef struct OptContext {
 +    TCGContext *tcg;
      TCGTempSet temps_used;
  } OptContext;
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
      return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
  }
 -static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 +static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      TCGOpcode new_op;
      if (ts_are_copies(dst_ts, src_ts)) {
 -        tcg_op_remove(s, op);
 +        tcg_op_remove(ctx->tcg, op);
          return;
      }
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      }
  }
 -static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
 -                             TCGOp *op, TCGArg dst, uint64_t val)
 +static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
 +                             TCGArg dst, uint64_t val)
  {
      const TCGOpDef *def = &tcg_op_defs[op->opc];
      TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
      /* Convert movi to mov with constant temp. */
      tv = tcg_constant_internal(type, val);
      init_ts_info(ctx, tv);
 -    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 +    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
  static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
  {
      int nb_temps, nb_globals, i;
      TCGOp *op, *op_next, *prev_mb = NULL;
 -    OptContext ctx = {};
 +    OptContext ctx = { .tcg = s };
      /* Array VALS has an element for each temp.
         If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(rotr):
              if (arg_is_const(op->args[1])
                  && arg_info(op->args[1])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == -1) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (partmask == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
              continue;
          }
          if (affected == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(mulsh):
              if (arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(or):
          CASE_OP_32_64_VEC(and):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
          CASE_OP_32_64_VEC(xor):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             allocator where needed and possible.  Also detect copies. */
          switch (opc) {
          CASE_OP_32_64_VEC(mov):
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
 +                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                   deposit64(arg_info(op->args[1])->val, 32, 32,
                                             arg_info(op->args[2])->val));
                  continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGArg v = arg_info(op->args[1])->val;
                  if (v != 0) {
                      tmp = do_constant_folding(opc, v, 0);
 -                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  } else {
 -                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
 +                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                  }
                  continue;
              }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = deposit64(arg_info(op->args[1])->val,
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                      ((uint32_t)v2 << (32 - shr)));
                  }
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[5]);
              if (tmp != 2) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                              op->args[5]);
              if (tmp != 2) {
              do_setcond_const:
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 --
 .25.1

-New patch
+[PULL 10/56] tcg/optimize: Move prev_mb into OptContext
+This will expose the variable to subroutines that
+will be broken out of tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 11 ++++++-----
+file changed, 6 insertions(+), 5 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+ typedef struct OptContext {
+     TCGContext *tcg;
++    TCGOp *prev_mb;
+     TCGTempSet temps_used;
+ } OptContext;
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
+ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+-    TCGOp *op, *op_next, *prev_mb = NULL;
++    TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+     /* Array VALS has an element for each temp.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+-        if (prev_mb) {
++        if (ctx.prev_mb) {
+             switch (opc) {
+             case INDEX_op_mb:
+                 /* Merge two barriers of the same type into one,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                  * barrier.  This is stricter than specified but for
+                  * the purposes of TCG is better than not optimizing.
+                  */
+-                prev_mb->args[0] |= op->args[0];
++                ctx.prev_mb->args[0] |= op->args[0];
+                 tcg_op_remove(s, op);
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i64:
+             case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+-                prev_mb = NULL;
++                ctx.prev_mb = NULL;
+                 break;
+             }
+         } else if (opc == INDEX_op_mb) {
+-            prev_mb = op;
++            ctx.prev_mb = op;
+         }
+     }
+ }
+--
+.25.1

-[PULL 10/24] target/m68k: Drop checks for singlestep_enabled
+[PULL 11/56] tcg/optimize: Split out init_arguments
-GDB single-stepping is now handled generically.
+There was no real reason for calls to have separate code here.
 Unify init for calls vs non-calls using the call path, which
 handles TCG_CALL_DUMMY_ARG.
-Acked-by: Laurent Vivier <laurent@vivier.eu>
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/m68k/translate.c | 44 +++++++++--------------------------------
+ tcg/optimize.c | 25 +++++++++++--------------
-file changed, 9 insertions(+), 35 deletions(-)
+file changed, 11 insertions(+), 14 deletions(-)
-diff --git a/target/m68k/translate.c b/target/m68k/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/m68k/translate.c
+--- a/tcg/optimize.c
-+++ b/target/m68k/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void do_writebacks(DisasContext *s)
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
      }
  }
--static bool is_singlestepping(DisasContext *s)
+-static void init_arg_info(OptContext *ctx, TCGArg arg)
 -{
--    /*
+-    init_ts_info(ctx, arg_temp(arg));
 -     * Return true if we are singlestepping either because of
 -     * architectural singlestep or QEMU gdbstub singlestep. This does
 -     * not include the command line '-singlestep' mode which is rather
 -     * misnamed as it only means "one instruction per TB" and doesn't
 -     * affect the code we generate.
 -     */
 -    return s->base.singlestep_enabled || s->ss_active;
 -}
 -
- /* is_jmp field values */
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
- #define DISAS_JUMP      DISAS_TARGET_0 /* only pc was modified dynamically */
+ {
- #define DISAS_EXIT      DISAS_TARGET_1 /* cpu state was modified dynamically */
+     TCGTemp *i, *g, *l;
-@@ -XXX,XX +XXX,XX @@ static void gen_exception(DisasContext *s, uint32_t dest, int nr)
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
-     s->base.is_jmp = DISAS_NORETURN;
+     return false;
  }
--static void gen_singlestep_exception(DisasContext *s)
++static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
--{
++{
--    /*
++    for (int i = 0; i < nb_args; i++) {
--     * Generate the right kind of exception for singlestep, which is
++        TCGTemp *ts = arg_temp(op->args[i]);
--     * either the architectural singlestep or EXCP_DEBUG for QEMU's
++        if (ts) {
--     * gdb singlestepping.
++            init_ts_info(ctx, ts);
--     */
++        }
--    if (s->ss_active) {
++    }
--        gen_raise_exception(EXCP_TRACE);
++}
--    } else {
++
--        gen_raise_exception(EXCP_DEBUG);
+ /* Propagate constants and copies, fold constant expressions. */
--    }
+ void tcg_optimize(TCGContext *s)
 -}
 -
  static inline void gen_addr_fault(DisasContext *s)
  {
-     gen_exception(s, s->base.pc_next, EXCP_ADDRESS);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ static void gen_exit_tb(DisasContext *s)
+         if (opc == INDEX_op_call) {
- /* Generate a jump to an immediate address.  */
+             nb_oargs = TCGOP_CALLO(op);
- static void gen_jmp_tb(DisasContext *s, int n, uint32_t dest)
+             nb_iargs = TCGOP_CALLI(op);
- {
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
--    if (unlikely(is_singlestepping(s))) {
+-                TCGTemp *ts = arg_temp(op->args[i]);
-+    if (unlikely(s->ss_active)) {
+-                if (ts) {
-         update_cc_op(s);
+-                    init_ts_info(&ctx, ts);
-         tcg_gen_movi_i32(QREG_PC, dest);
+-                }
--        gen_singlestep_exception(s);
+-            }
 +        gen_raise_exception(EXCP_TRACE);
      } else if (translator_use_goto_tb(&s->base, dest)) {
          tcg_gen_goto_tb(n);
          tcg_gen_movi_i32(QREG_PC, dest);
@@ -XXX,XX +XXX,XX @@ static void m68k_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
      dc->ss_active = (M68K_SR_TRACE(env->sr) == M68K_SR_TRACE_ANY_INS);
      /* If architectural single step active, limit to 1 */
 -    if (is_singlestepping(dc)) {
 +    if (dc->ss_active) {
          dc->base.max_insns = 1;
      }
  }
@@ -XXX,XX +XXX,XX @@ static void m68k_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
          break;
      case DISAS_TOO_MANY:
          update_cc_op(dc);
 -        if (is_singlestepping(dc)) {
 +        if (dc->ss_active) {
              tcg_gen_movi_i32(QREG_PC, dc->pc);
 -            gen_singlestep_exception(dc);
 +            gen_raise_exception(EXCP_TRACE);
          } else {
-             gen_jmp_tb(dc, 0, dc->pc);
+             nb_oargs = def->nb_oargs;
              nb_iargs = def->nb_iargs;
 -            for (i = 0; i < nb_oargs + nb_iargs; i++) {
 -                init_arg_info(&ctx, op->args[i]);
 -            }
          }
-         break;
++        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-     case DISAS_JUMP:
-         /* We updated CC_OP and PC in gen_jmp/gen_jmp_im.  */
+         /* Do copy propagation */
--        if (is_singlestepping(dc)) {
+         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 -            gen_singlestep_exception(dc);
 +        if (dc->ss_active) {
 +            gen_raise_exception(EXCP_TRACE);
          } else {
              tcg_gen_lookup_and_goto_ptr();
          }
@@ -XXX,XX +XXX,XX @@ static void m68k_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
           * We updated CC_OP and PC in gen_exit_tb, but also modified
           * other state that may require returning to the main loop.
           */
 -        if (is_singlestepping(dc)) {
 -            gen_singlestep_exception(dc);
 +        if (dc->ss_active) {
 +            gen_raise_exception(EXCP_TRACE);
          } else {
              tcg_gen_exit_tb(NULL, 0);
          }
 --
 .25.1

-New patch
+[PULL 12/56] tcg/optimize: Split out copy_propagate
+Continue splitting tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 22 ++++++++++++++--------
+file changed, 14 insertions(+), 8 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
+     }
+ }
++static void copy_propagate(OptContext *ctx, TCGOp *op,
++                           int nb_oargs, int nb_iargs)
++{
++    TCGContext *s = ctx->tcg;
++
++    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
++        TCGTemp *ts = arg_temp(op->args[i]);
++        if (ts && ts_is_copy(ts)) {
++            op->args[i] = temp_arg(find_better_copy(s, ts));
++        }
++    }
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             nb_iargs = def->nb_iargs;
+         }
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
+-
+-        /* Do copy propagation */
+-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+-            TCGTemp *ts = arg_temp(op->args[i]);
+-            if (ts && ts_is_copy(ts)) {
+-                op->args[i] = temp_arg(find_better_copy(s, ts));
+-            }
+-        }
++        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+         /* For commutative operations make constant second argument */
+         switch (opc) {
+--
+.25.1

-New patch
+[PULL 13/56] tcg/optimize: Split out fold_call
+Calls are special in that they have a variable number
+of arguments, and need to be able to clobber globals.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
+file changed, 41 insertions(+), 22 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
+     }
+ }
++static bool fold_call(OptContext *ctx, TCGOp *op)
++{
++    TCGContext *s = ctx->tcg;
++    int nb_oargs = TCGOP_CALLO(op);
++    int nb_iargs = TCGOP_CALLI(op);
++    int flags, i;
++
++    init_arguments(ctx, op, nb_oargs + nb_iargs);
++    copy_propagate(ctx, op, nb_oargs, nb_iargs);
++
++    /* If the function reads or writes globals, reset temp data. */
++    flags = tcg_call_flags(op);
++    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
++        int nb_globals = s->nb_globals;
++
++        for (i = 0; i < nb_globals; i++) {
++            if (test_bit(i, ctx->temps_used.l)) {
++                reset_ts(&ctx->tcg->temps[i]);
++            }
++        }
++    }
++
++    /* Reset temp data for outputs. */
++    for (i = 0; i < nb_oargs; i++) {
++        reset_temp(op->args[i]);
++    }
++
++    /* Stop optimizing MB across calls. */
++    ctx->prev_mb = NULL;
++    return true;
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+-    int nb_temps, nb_globals, i;
++    int nb_temps, i;
+     TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+        available through the doubly linked circular list. */
+     nb_temps = s->nb_temps;
+-    nb_globals = s->nb_globals;
+-
+     for (i = 0; i < nb_temps; ++i) {
+         s->temps[i].state_ptr = NULL;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
+         int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
+-        const TCGOpDef *def = &tcg_op_defs[opc];
++        const TCGOpDef *def;
+-        /* Count the arguments, and initialize the temps that are
+-           going to be used */
++        /* Calls are special. */
+         if (opc == INDEX_op_call) {
+-            nb_oargs = TCGOP_CALLO(op);
+-            nb_iargs = TCGOP_CALLI(op);
+-        } else {
+-            nb_oargs = def->nb_oargs;
+-            nb_iargs = def->nb_iargs;
++            fold_call(&ctx, op);
++            continue;
+         }
++
++        def = &tcg_op_defs[opc];
++        nb_oargs = def->nb_oargs;
++        nb_iargs = def->nb_iargs;
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
+         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (def->flags & TCG_OPF_BB_END) {
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+         } else {
+-            if (opc == INDEX_op_call &&
+-                !(tcg_call_flags(op)
+-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+-                for (i = 0; i < nb_globals; i++) {
+-                    if (test_bit(i, ctx.temps_used.l)) {
+-                        reset_ts(&s->temps[i]);
+-                    }
+-                }
+-            }
+-
+             for (i = 0; i < nb_oargs; i++) {
+                 reset_temp(op->args[i]);
+                 /* Save the corresponding known-zero bits mask for the
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i32:
+             case INDEX_op_qemu_st8_i32:
+             case INDEX_op_qemu_st_i64:
+-            case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+                 ctx.prev_mb = NULL;
+                 break;
+--
+.25.1

-New patch
+[PULL 14/56] tcg/optimize: Drop nb_oargs, nb_iargs locals
+Rather than try to keep these up-to-date across folding,
+re-read nb_oargs at the end, after re-reading the opcode.
+A couple of asserts need dropping, but that will take care
+of itself as we split the function further.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 14 ++++----------
+file changed, 4 insertions(+), 10 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
+         uint64_t z_mask, partmask, affected, tmp;
+-        int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         def = &tcg_op_defs[opc];
+-        nb_oargs = def->nb_oargs;
+-        nb_iargs = def->nb_iargs;
+-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
+-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
++        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
++        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
+         /* For commutative operations make constant second argument */
+         switch (opc) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(qemu_ld):
+             {
+-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
++                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
+                 MemOp mop = get_memop(oi);
+                 if (!(mop & MO_SIGN)) {
+                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         if (partmask == 0) {
+-            tcg_debug_assert(nb_oargs == 1);
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+             continue;
+         }
+         if (affected == 0) {
+-            tcg_debug_assert(nb_oargs == 1);
+             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+             continue;
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             } else if (args_are_copies(op->args[1], op->args[2])) {
+                 op->opc = INDEX_op_dup_vec;
+                 TCGOP_VECE(op) = MO_32;
+-                nb_iargs = 1;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
+                                  ? INDEX_op_setcond_i32
+                                  : INDEX_op_setcond_i64);
+-                nb_iargs = 2;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (def->flags & TCG_OPF_BB_END) {
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+         } else {
++            int nb_oargs = def->nb_oargs;
+             for (i = 0; i < nb_oargs; i++) {
+                 reset_temp(op->args[i]);
+                 /* Save the corresponding known-zero bits mask for the
+--
+.25.1

-[PULL 14/24] target/mips: Drop exit checks for singlestep_enabled
+[PULL 15/56] tcg/optimize: Change fail return for do_constant_folding_cond*
-GDB single-stepping is now handled generically.
+Return -1 instead of 2 for failure, so that we can
+use comparisons against 0 for all cases.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/mips/tcg/translate.c | 50 +++++++++++++------------------------
+ tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
-file changed, 18 insertions(+), 32 deletions(-)
+file changed, 74 insertions(+), 71 deletions(-)
-diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/translate.c
+--- a/tcg/optimize.c
-+++ b/target/mips/tcg/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
+@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
          tcg_gen_exit_tb(ctx->base.tb, n);
      } else {
          gen_save_pc(dest);
 -        if (ctx->base.singlestep_enabled) {
 -            save_cpu_state(ctx, 0);
 -            gen_helper_raise_exception_debug(cpu_env);
 -        } else {
 -            tcg_gen_lookup_and_goto_ptr();
 -        }
 +        tcg_gen_lookup_and_goto_ptr();
      }
  }
-@@ -XXX,XX +XXX,XX @@ static void gen_branch(DisasContext *ctx, int insn_bytes)
+-/* Return 2 if the condition can't be simplified, and the result
-             } else {
+-   of the condition (0 or 1) if it can */
-                 tcg_gen_mov_tl(cpu_PC, btarget);
+-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
-             }
+-                                       TCGArg y, TCGCond c)
--            if (ctx->base.singlestep_enabled) {
++/*
--                save_cpu_state(ctx, 0);
++ * Return -1 if the condition can't be simplified,
--                gen_helper_raise_exception_debug(cpu_env);
++ * and the result of the condition (0 or 1) if it can.
--            }
++ */
-             tcg_gen_lookup_and_goto_ptr();
++static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
-             break;
++                                    TCGArg y, TCGCond c)
  {
      uint64_t xv = arg_info(x)->val;
      uint64_t yv = arg_info(y)->val;
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
          case TCG_COND_GEU:
              return 1;
          default:
-@@ -XXX,XX +XXX,XX @@ static void mips_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
+-            return 2;
 +            return -1;
          }
      }
 -    return 2;
 +    return -1;
  }
 -/* Return 2 if the condition can't be simplified, and the result
 -   of the condition (0 or 1) if it can */
 -static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
 +/*
 + * Return -1 if the condition can't be simplified,
 + * and the result of the condition (0 or 1) if it can.
 + */
 +static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
  {
-     DisasContext *ctx = container_of(dcbase, DisasContext, base);
+     TCGArg al = p1[0], ah = p1[1];
+     TCGArg bl = p2[0], bh = p2[1];
--    if (ctx->base.singlestep_enabled && ctx->base.is_jmp != DISAS_NORETURN) {
+@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
--        save_cpu_state(ctx, ctx->base.is_jmp != DISAS_EXIT);
+     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
--        gen_helper_raise_exception_debug(cpu_env);
+         return do_constant_folding_cond_eq(c);
 -    } else {
 -        switch (ctx->base.is_jmp) {
 -        case DISAS_STOP:
 -            gen_save_pc(ctx->base.pc_next);
 -            tcg_gen_lookup_and_goto_ptr();
 -            break;
 -        case DISAS_NEXT:
 -        case DISAS_TOO_MANY:
 -            save_cpu_state(ctx, 0);
 -            gen_goto_tb(ctx, 0, ctx->base.pc_next);
 -            break;
 -        case DISAS_EXIT:
 -            tcg_gen_exit_tb(NULL, 0);
 -            break;
 -        case DISAS_NORETURN:
 -            break;
 -        default:
 -            g_assert_not_reached();
 -        }
 +    switch (ctx->base.is_jmp) {
 +    case DISAS_STOP:
 +        gen_save_pc(ctx->base.pc_next);
 +        tcg_gen_lookup_and_goto_ptr();
 +        break;
 +    case DISAS_NEXT:
 +    case DISAS_TOO_MANY:
 +        save_cpu_state(ctx, 0);
 +        gen_goto_tb(ctx, 0, ctx->base.pc_next);
 +        break;
 +    case DISAS_EXIT:
 +        tcg_gen_exit_tb(NULL, 0);
 +        break;
 +    case DISAS_NORETURN:
 +        break;
 +    default:
 +        g_assert_not_reached();
      }
+-    return 2;
++    return -1;
  }
+ static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         CASE_OP_32_64(setcond):
+-            tmp = do_constant_folding_cond(opc, op->args[1],
+-                                           op->args[2], op->args[3]);
+-            if (tmp != 2) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
++            i = do_constant_folding_cond(opc, op->args[1],
++                                         op->args[2], op->args[3]);
++            if (i >= 0) {
++                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+                 continue;
+             }
+             break;
+         CASE_OP_32_64(brcond):
+-            tmp = do_constant_folding_cond(opc, op->args[0],
+-                                           op->args[1], op->args[2]);
+-            switch (tmp) {
+-            case 0:
++            i = do_constant_folding_cond(opc, op->args[0],
++                                         op->args[1], op->args[2]);
++            if (i == 0) {
+                 tcg_op_remove(s, op);
+                 continue;
+-            case 1:
++            } else if (i > 0) {
+                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = opc = INDEX_op_br;
+                 op->args[0] = op->args[3];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         CASE_OP_32_64(movcond):
+-            tmp = do_constant_folding_cond(opc, op->args[1],
+-                                           op->args[2], op->args[5]);
+-            if (tmp != 2) {
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
++            i = do_constant_folding_cond(opc, op->args[1],
++                                         op->args[2], op->args[5]);
++            if (i >= 0) {
++                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
+                 continue;
+             }
+             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         case INDEX_op_brcond2_i32:
+-            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
+-                                            op->args[4]);
+-            if (tmp == 0) {
++            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
++                                          op->args[4]);
++            if (i == 0) {
+             do_brcond_false:
+                 tcg_op_remove(s, op);
+                 continue;
+             }
+-            if (tmp == 1) {
++            if (i > 0) {
+             do_brcond_true:
+                 op->opc = opc = INDEX_op_br;
+                 op->args[0] = op->args[5];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (op->args[4] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                               op->args[0], op->args[2],
+-                                               TCG_COND_EQ);
+-                if (tmp == 0) {
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
++                                             op->args[0], op->args[2],
++                                             TCG_COND_EQ);
++                if (i == 0) {
+                     goto do_brcond_false;
+-                } else if (tmp == 1) {
++                } else if (i > 0) {
+                     goto do_brcond_high;
+                 }
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                               op->args[1], op->args[3],
+-                                               TCG_COND_EQ);
+-                if (tmp == 0) {
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
++                                             op->args[1], op->args[3],
++                                             TCG_COND_EQ);
++                if (i == 0) {
+                     goto do_brcond_false;
+-                } else if (tmp != 1) {
++                } else if (i < 0) {
+                     break;
+                 }
+             do_brcond_low:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (op->args[4] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                               op->args[0], op->args[2],
+-                                               TCG_COND_NE);
+-                if (tmp == 0) {
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
++                                             op->args[0], op->args[2],
++                                             TCG_COND_NE);
++                if (i == 0) {
+                     goto do_brcond_high;
+-                } else if (tmp == 1) {
++                } else if (i > 0) {
+                     goto do_brcond_true;
+                 }
+-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                               op->args[1], op->args[3],
+-                                               TCG_COND_NE);
+-                if (tmp == 0) {
++                i = do_constant_folding_cond(INDEX_op_brcond_i32,
++                                             op->args[1], op->args[3],
++                                             TCG_COND_NE);
++                if (i == 0) {
+                     goto do_brcond_low;
+-                } else if (tmp == 1) {
++                } else if (i > 0) {
+                     goto do_brcond_true;
+                 }
+             }
+             break;
+         case INDEX_op_setcond2_i32:
+-            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
+-                                            op->args[5]);
+-            if (tmp != 2) {
++            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
++                                          op->args[5]);
++            if (i >= 0) {
+             do_setcond_const:
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
++                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+                 continue;
+             }
+             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (op->args[5] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                               op->args[1], op->args[3],
+-                                               TCG_COND_EQ);
+-                if (tmp == 0) {
++                i = do_constant_folding_cond(INDEX_op_setcond_i32,
++                                             op->args[1], op->args[3],
++                                             TCG_COND_EQ);
++                if (i == 0) {
+                     goto do_setcond_const;
+-                } else if (tmp == 1) {
++                } else if (i > 0) {
+                     goto do_setcond_high;
+                 }
+-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                               op->args[2], op->args[4],
+-                                               TCG_COND_EQ);
+-                if (tmp == 0) {
++                i = do_constant_folding_cond(INDEX_op_setcond_i32,
++                                             op->args[2], op->args[4],
++                                             TCG_COND_EQ);
++                if (i == 0) {
+                     goto do_setcond_high;
+-                } else if (tmp != 1) {
++                } else if (i < 0) {
+                     break;
+                 }
+             do_setcond_low:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (op->args[5] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                               op->args[1], op->args[3],
+-                                               TCG_COND_NE);
+-                if (tmp == 0) {
++                i = do_constant_folding_cond(INDEX_op_setcond_i32,
++                                             op->args[1], op->args[3],
++                                             TCG_COND_NE);
++                if (i == 0) {
+                     goto do_setcond_high;
+-                } else if (tmp == 1) {
++                } else if (i > 0) {
+                     goto do_setcond_const;
+                 }
+-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                               op->args[2], op->args[4],
+-                                               TCG_COND_NE);
+-                if (tmp == 0) {
++                i = do_constant_folding_cond(INDEX_op_setcond_i32,
++                                             op->args[2], op->args[4],
++                                             TCG_COND_NE);
++                if (i == 0) {
+                     goto do_setcond_low;
+-                } else if (tmp == 1) {
++                } else if (i > 0) {
+                     goto do_setcond_const;
+                 }
+             }
 --
 .25.1

-New patch
+[PULL 16/56] tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
+This will allow callers to tail call to these functions
+and return true indicating processing complete.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 9 +++++----
+file changed, 5 insertions(+), 4 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
+     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
+ }
+-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
++static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+ {
+     TCGTemp *dst_ts = arg_temp(dst);
+     TCGTemp *src_ts = arg_temp(src);
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+     if (ts_are_copies(dst_ts, src_ts)) {
+         tcg_op_remove(ctx->tcg, op);
+-        return;
++        return true;
+     }
+     reset_ts(dst_ts);
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+         di->is_const = si->is_const;
+         di->val = si->val;
+     }
++    return true;
+ }
+-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
++static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+                              TCGArg dst, uint64_t val)
+ {
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+     /* Convert movi to mov with constant temp. */
+     tv = tcg_constant_internal(type, val);
+     init_ts_info(ctx, tv);
+-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
++    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
+ }
+ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
+--
+.25.1

-New patch
+[PULL 17/56] tcg/optimize: Split out finish_folding
+Copy z_mask into OptContext, for writeback to the
+first output within the new function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
+file changed, 33 insertions(+), 16 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
+     TCGContext *tcg;
+     TCGOp *prev_mb;
+     TCGTempSet temps_used;
++
++    /* In flight values from optimization. */
++    uint64_t z_mask;
+ } OptContext;
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
+     }
+ }
++static void finish_folding(OptContext *ctx, TCGOp *op)
++{
++    const TCGOpDef *def = &tcg_op_defs[op->opc];
++    int i, nb_oargs;
++
++    /*
++     * For an opcode that ends a BB, reset all temp data.
++     * We do no cross-BB optimization.
++     */
++    if (def->flags & TCG_OPF_BB_END) {
++        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
++        ctx->prev_mb = NULL;
++        return;
++    }
++
++    nb_oargs = def->nb_oargs;
++    for (i = 0; i < nb_oargs; i++) {
++        reset_temp(op->args[i]);
++        /*
++         * Save the corresponding known-zero bits mask for the
++         * first output argument (only one supported so far).
++         */
++        if (i == 0) {
++            arg_info(op->args[i])->z_mask = ctx->z_mask;
++        }
++    }
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             partmask &= 0xffffffffu;
+             affected &= 0xffffffffu;
+         }
++        ctx.z_mask = z_mask;
+         if (partmask == 0) {
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Some of the folding above can change opc. */
+-        opc = op->opc;
+-        def = &tcg_op_defs[opc];
+-        if (def->flags & TCG_OPF_BB_END) {
+-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-        } else {
+-            int nb_oargs = def->nb_oargs;
+-            for (i = 0; i < nb_oargs; i++) {
+-                reset_temp(op->args[i]);
+-                /* Save the corresponding known-zero bits mask for the
+-                   first output argument (only one supported so far). */
+-                if (i == 0) {
+-                    arg_info(op->args[i])->z_mask = z_mask;
+-                }
+-            }
+-        }
++        finish_folding(&ctx, op);
+         /* Eliminate duplicate and redundant fence instructions.  */
+         if (ctx.prev_mb) {
+--
+.25.1

-New patch
+[PULL 18/56] tcg/optimize: Use a boolean to avoid a mass of continues
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 9 ++++++---
+file changed, 6 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def;
++        bool done = false;
+         /* Calls are special. */
+         if (opc == INDEX_op_call) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+            allocator where needed and possible.  Also detect copies. */
+         switch (opc) {
+         CASE_OP_32_64_VEC(mov):
+-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-            continue;
++            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
++            break;
+         case INDEX_op_dup_vec:
+             if (arg_is_const(op->args[1])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        finish_folding(&ctx, op);
++        if (!done) {
++            finish_folding(&ctx, op);
++        }
+         /* Eliminate duplicate and redundant fence instructions.  */
+         if (ctx.prev_mb) {
+--
+.25.1

-[PULL 13/24] target/mips: Fix single stepping
+[PULL 19/56] tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
-As per an ancient comment in mips_tr_translate_insn about the
+This puts the separate mb optimization into the same framework
-expectations of gdb, when restarting the insn in a delay slot
+as the others.  While fold_qemu_{ld,st} are currently identical,
-we also re-execute the branch.  Which means that we are
+that won't last as more code gets moved.
 expected to execute two insns in this case.
-This has been broken since 8b86d6d2580, where we forced max_insns
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 to 1 while single-stepping.  This resulted in an exit from the
 translator loop after the branch but before the delay slot is
 translated.
 Increase the max_insns to 2 for this case.  In addition, bypass
 the end-of-page check, for when the branch itself ends the page.
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/mips/tcg/translate.c | 25 ++++++++++++++++---------
+ tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
-file changed, 16 insertions(+), 9 deletions(-)
+file changed, 51 insertions(+), 38 deletions(-)
-diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/mips/tcg/translate.c
+--- a/tcg/optimize.c
-+++ b/target/mips/tcg/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void mips_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
-     ctx->default_tcg_memop_mask = (ctx->insn_flags & (ISA_MIPS_R6 |
+     return true;
-                                   INSN_LOONGSON3A)) ? MO_UNALN : MO_ALIGN;
+ }
-+    /*
++static bool fold_mb(OptContext *ctx, TCGOp *op)
-+     * Execute a branch and its delay slot as a single instruction.
++{
-+     * This is what GDB expects and is consistent with what the
++    /* Eliminate duplicate and redundant fence instructions.  */
-+     * hardware does (e.g. if a delay slot instruction faults, the
++    if (ctx->prev_mb) {
-+     * reported PC is the PC of the branch).
++        /*
-+     */
++         * Merge two barriers of the same type into one,
-+    if (ctx->base.singlestep_enabled && (ctx->hflags & MIPS_HFLAG_BMASK)) {
++         * or a weaker barrier into a stronger one,
-+        ctx->base.max_insns = 2;
++         * or two weaker barriers into a stronger one.
 +         *   mb X; mb Y => mb X|Y
 +         *   mb; strl => mb; st
 +         *   ldaq; mb => ld; mb
 +         *   ldaq; strl => ld; mb; st
 +         * Other combinations are also merged into a strong
 +         * barrier.  This is stricter than specified but for
 +         * the purposes of TCG is better than not optimizing.
 +         */
 +        ctx->prev_mb->args[0] |= op->args[0];
 +        tcg_op_remove(ctx->tcg, op);
 +    } else {
 +        ctx->prev_mb = op;
 +    }
++    return true;
++}
 +
-     LOG_DISAS("\ntb %p idx %d hflags %04x\n", ctx->base.tb, ctx->mem_idx,
++static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
-               ctx->hflags);
++{
- }
++    /* Opcodes that touch guest memory stop the mb optimization.  */
-@@ -XXX,XX +XXX,XX @@ static void mips_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
++    ctx->prev_mb = NULL;
-     if (ctx->base.is_jmp != DISAS_NEXT) {
++    return false;
-         return;
++}
      }
 +
-     /*
++static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
--     * Execute a branch and its delay slot as a single instruction.
++{
--     * This is what GDB expects and is consistent with what the
++    /* Opcodes that touch guest memory stop the mb optimization.  */
--     * hardware does (e.g. if a delay slot instruction faults, the
++    ctx->prev_mb = NULL;
--     * reported PC is the PC of the branch).
++    return false;
-+     * End the TB on (most) page crossings.
++}
-+     * See mips_tr_init_disas_context about single-stepping a branch
++
-+     * together with its delay slot.
+ /* Propagate constants and copies, fold constant expressions. */
-      */
+ void tcg_optimize(TCGContext *s)
--    if (ctx->base.singlestep_enabled &&
+ {
--        (ctx->hflags & MIPS_HFLAG_BMASK) == 0) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
--        ctx->base.is_jmp = DISAS_TOO_MANY;
+             }
--    }
+             break;
--    if (ctx->base.pc_next - ctx->page_start >= TARGET_PAGE_SIZE) {
-+    if (ctx->base.pc_next - ctx->page_start >= TARGET_PAGE_SIZE
++        case INDEX_op_mb:
-+        && !ctx->base.singlestep_enabled) {
++            done = fold_mb(&ctx, op);
-         ctx->base.is_jmp = DISAS_TOO_MANY;
++            break;
 +        case INDEX_op_qemu_ld_i32:
 +        case INDEX_op_qemu_ld_i64:
 +            done = fold_qemu_ld(&ctx, op);
 +            break;
 +        case INDEX_op_qemu_st_i32:
 +        case INDEX_op_qemu_st8_i32:
 +        case INDEX_op_qemu_st_i64:
 +            done = fold_qemu_st(&ctx, op);
 +            break;
 +
          default:
              break;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (!done) {
              finish_folding(&ctx, op);
          }
 -
 -        /* Eliminate duplicate and redundant fence instructions.  */
 -        if (ctx.prev_mb) {
 -            switch (opc) {
 -            case INDEX_op_mb:
 -                /* Merge two barriers of the same type into one,
 -                 * or a weaker barrier into a stronger one,
 -                 * or two weaker barriers into a stronger one.
 -                 *   mb X; mb Y => mb X|Y
 -                 *   mb; strl => mb; st
 -                 *   ldaq; mb => ld; mb
 -                 *   ldaq; strl => ld; mb; st
 -                 * Other combinations are also merged into a strong
 -                 * barrier.  This is stricter than specified but for
 -                 * the purposes of TCG is better than not optimizing.
 -                 */
 -                ctx.prev_mb->args[0] |= op->args[0];
 -                tcg_op_remove(s, op);
 -                break;
 -
 -            default:
 -                /* Opcodes that end the block stop the optimization.  */
 -                if ((def->flags & TCG_OPF_BB_END) == 0) {
 -                    break;
 -                }
 -                /* fallthru */
 -            case INDEX_op_qemu_ld_i32:
 -            case INDEX_op_qemu_ld_i64:
 -            case INDEX_op_qemu_st_i32:
 -            case INDEX_op_qemu_st8_i32:
 -            case INDEX_op_qemu_st_i64:
 -                /* Opcodes that touch guest memory stop the optimization.  */
 -                ctx.prev_mb = NULL;
 -                break;
 -            }
 -        } else if (opc == INDEX_op_mb) {
 -            ctx.prev_mb = op;
 -        }
      }
  }
 --
 .25.1

-[PULL 18/24] target/riscv: Remove exit_tb and lookup_and_goto_ptr
+[PULL 20/56] tcg/optimize: Split out fold_const{1,2}
-GDB single-stepping is now handled generically, which means
+Split out a whole bunch of placeholder functions, which are
-we don't need to do anything in the wrappers.
+currently identical.  That won't last as more code gets moved.
-Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
+Use CASE_32_64_VEC for some logical operators that previously
 missed the addition of vectors.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/riscv/translate.c                      | 27 +------------------
+ tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
- .../riscv/insn_trans/trans_privileged.c.inc   |  4 +--
+file changed, 219 insertions(+), 52 deletions(-)
- target/riscv/insn_trans/trans_rvi.c.inc       |  8 +++---
- target/riscv/insn_trans/trans_rvv.c.inc       |  2 +-
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 files changed, 7 insertions(+), 34 deletions(-)
 diff --git a/target/riscv/translate.c b/target/riscv/translate.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/riscv/translate.c
+--- a/tcg/optimize.c
-+++ b/target/riscv/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void generate_exception_mtval(DisasContext *ctx, int excp)
+@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      ctx->base.is_jmp = DISAS_NORETURN;
  }
 -static void gen_exception_debug(void)
 -{
 -    gen_helper_raise_exception(cpu_env, tcg_constant_i32(EXCP_DEBUG));
 -}
 -
 -/* Wrapper around tcg_gen_exit_tb that handles single stepping */
 -static void exit_tb(DisasContext *ctx)
 -{
 -    if (ctx->base.singlestep_enabled) {
 -        gen_exception_debug();
 -    } else {
 -        tcg_gen_exit_tb(NULL, 0);
 -    }
 -}
 -
 -/* Wrapper around tcg_gen_lookup_and_goto_ptr that handles single stepping */
 -static void lookup_and_goto_ptr(DisasContext *ctx)
 -{
 -    if (ctx->base.singlestep_enabled) {
 -        gen_exception_debug();
 -    } else {
 -        tcg_gen_lookup_and_goto_ptr();
 -    }
 -}
 -
  static void gen_exception_illegal(DisasContext *ctx)
  {
      generate_exception(ctx, RISCV_EXCP_ILLEGAL_INST);
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
          tcg_gen_exit_tb(ctx->base.tb, n);
      } else {
          tcg_gen_movi_tl(cpu_pc, dest);
 -        lookup_and_goto_ptr(ctx);
 +        tcg_gen_lookup_and_goto_ptr();
      }
  }
-diff --git a/target/riscv/insn_trans/trans_privileged.c.inc b/target/riscv/insn_trans/trans_privileged.c.inc
++/*
-index XXXXXXX..XXXXXXX 100644
++ * The fold_* functions return true when processing is complete,
---- a/target/riscv/insn_trans/trans_privileged.c.inc
++ * usually by folding the operation to a constant or to a copy,
-+++ b/target/riscv/insn_trans/trans_privileged.c.inc
++ * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
-@@ -XXX,XX +XXX,XX @@ static bool trans_sret(DisasContext *ctx, arg_sret *a)
++ * like collect information about the value produced, for use in
++ * optimizing a subsequent operation.
-     if (has_ext(ctx, RVS)) {
++ *
-         gen_helper_sret(cpu_pc, cpu_env, cpu_pc);
++ * These first fold_* functions are all helpers, used by other
--        exit_tb(ctx); /* no chaining */
++ * folders for more specific operations.
-+        tcg_gen_exit_tb(NULL, 0); /* no chaining */
++ */
-         ctx->base.is_jmp = DISAS_NORETURN;
++
-     } else {
++static bool fold_const1(OptContext *ctx, TCGOp *op)
-         return false;
++{
-@@ -XXX,XX +XXX,XX @@ static bool trans_mret(DisasContext *ctx, arg_mret *a)
++    if (arg_is_const(op->args[1])) {
- #ifndef CONFIG_USER_ONLY
++        uint64_t t;
-     tcg_gen_movi_tl(cpu_pc, ctx->base.pc_next);
++
-     gen_helper_mret(cpu_pc, cpu_env, cpu_pc);
++        t = arg_info(op->args[1])->val;
--    exit_tb(ctx); /* no chaining */
++        t = do_constant_folding(op->opc, t, 0);
-+    tcg_gen_exit_tb(NULL, 0); /* no chaining */
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
-     ctx->base.is_jmp = DISAS_NORETURN;
++    }
-     return true;
++    return false;
- #else
++}
-diff --git a/target/riscv/insn_trans/trans_rvi.c.inc b/target/riscv/insn_trans/trans_rvi.c.inc
++
-index XXXXXXX..XXXXXXX 100644
++static bool fold_const2(OptContext *ctx, TCGOp *op)
---- a/target/riscv/insn_trans/trans_rvi.c.inc
++{
-+++ b/target/riscv/insn_trans/trans_rvi.c.inc
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-@@ -XXX,XX +XXX,XX @@ static bool trans_jalr(DisasContext *ctx, arg_jalr *a)
++        uint64_t t1 = arg_info(op->args[1])->val;
-     if (a->rd != 0) {
++        uint64_t t2 = arg_info(op->args[2])->val;
-         tcg_gen_movi_tl(cpu_gpr[a->rd], ctx->pc_succ_insn);
++
-     }
++        t1 = do_constant_folding(op->opc, t1, t2);
--
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
--    /* No chaining with JALR. */
++    }
--    lookup_and_goto_ptr(ctx);
++    return false;
-+    tcg_gen_lookup_and_goto_ptr();
++}
++
-     if (misaligned) {
++/*
-         gen_set_label(misaligned);
++ * These outermost fold_<op> functions are sorted alphabetically.
-@@ -XXX,XX +XXX,XX @@ static bool trans_fence_i(DisasContext *ctx, arg_fence_i *a)
++ */
-      * however we need to end the translation block
++
-      */
++static bool fold_add(OptContext *ctx, TCGOp *op)
-     tcg_gen_movi_tl(cpu_pc, ctx->pc_succ_insn);
++{
--    exit_tb(ctx);
++    return fold_const2(ctx, op);
-+    tcg_gen_exit_tb(NULL, 0);
++}
-     ctx->base.is_jmp = DISAS_NORETURN;
++
 +static bool fold_and(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_andc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
      TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
      return true;
  }
-@@ -XXX,XX +XXX,XX @@ static bool do_csr_post(DisasContext *ctx)
 +static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_divide(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_eqv(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_exts(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_extu(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
  static bool fold_mb(OptContext *ctx, TCGOp *op)
  {
-     /* We may have changed important cpu state -- exit to main loop. */
+     /* Eliminate duplicate and redundant fence instructions.  */
-     tcg_gen_movi_tl(cpu_pc, ctx->pc_succ_insn);
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
 -    exit_tb(ctx);
 +    tcg_gen_exit_tb(NULL, 0);
      ctx->base.is_jmp = DISAS_NORETURN;
      return true;
  }
-diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
-index XXXXXXX..XXXXXXX 100644
++static bool fold_mul(OptContext *ctx, TCGOp *op)
---- a/target/riscv/insn_trans/trans_rvv.c.inc
++{
-+++ b/target/riscv/insn_trans/trans_rvv.c.inc
++    return fold_const2(ctx, op);
-@@ -XXX,XX +XXX,XX @@ static bool trans_vsetvl(DisasContext *ctx, arg_vsetvl *a)
++}
-     gen_set_gpr(ctx, a->rd, dst);
++
++static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
-     tcg_gen_movi_tl(cpu_pc, ctx->pc_succ_insn);
++{
--    lookup_and_goto_ptr(ctx);
++    return fold_const2(ctx, op);
-+    tcg_gen_lookup_and_goto_ptr();
++}
-     ctx->base.is_jmp = DISAS_NORETURN;
++
-     return true;
++static bool fold_nand(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_neg(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_nor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_not(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_or(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_orc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
      /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
      return false;
  }
++static bool fold_remainder(OptContext *ctx, TCGOp *op)
++{
++    return fold_const2(ctx, op);
++}
++
++static bool fold_shift(OptContext *ctx, TCGOp *op)
++{
++    return fold_const2(ctx, op);
++}
++
++static bool fold_sub(OptContext *ctx, TCGOp *op)
++{
++    return fold_const2(ctx, op);
++}
++
++static bool fold_xor(OptContext *ctx, TCGOp *op)
++{
++    return fold_const2(ctx, op);
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(not):
+-        CASE_OP_32_64(neg):
+-        CASE_OP_32_64(ext8s):
+-        CASE_OP_32_64(ext8u):
+-        CASE_OP_32_64(ext16s):
+-        CASE_OP_32_64(ext16u):
+-        CASE_OP_32_64(ctpop):
+-        case INDEX_op_ext32s_i64:
+-        case INDEX_op_ext32u_i64:
+-        case INDEX_op_ext_i32_i64:
+-        case INDEX_op_extu_i32_i64:
+-        case INDEX_op_extrl_i64_i32:
+-        case INDEX_op_extrh_i64_i32:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(bswap16):
+         CASE_OP_32_64(bswap32):
+         case INDEX_op_bswap64_i64:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(add):
+-        CASE_OP_32_64(sub):
+-        CASE_OP_32_64(mul):
+-        CASE_OP_32_64(or):
+-        CASE_OP_32_64(and):
+-        CASE_OP_32_64(xor):
+-        CASE_OP_32_64(shl):
+-        CASE_OP_32_64(shr):
+-        CASE_OP_32_64(sar):
+-        CASE_OP_32_64(rotl):
+-        CASE_OP_32_64(rotr):
+-        CASE_OP_32_64(andc):
+-        CASE_OP_32_64(orc):
+-        CASE_OP_32_64(eqv):
+-        CASE_OP_32_64(nand):
+-        CASE_OP_32_64(nor):
+-        CASE_OP_32_64(muluh):
+-        CASE_OP_32_64(mulsh):
+-        CASE_OP_32_64(div):
+-        CASE_OP_32_64(divu):
+-        CASE_OP_32_64(rem):
+-        CASE_OP_32_64(remu):
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+-                                          arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+             if (arg_is_const(op->args[1])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
++        default:
++            break;
++
++        /* ---------------------------------------------------------- */
++        /* Sorted alphabetically by opcode as much as possible. */
++
++        CASE_OP_32_64_VEC(add):
++            done = fold_add(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(and):
++            done = fold_and(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(andc):
++            done = fold_andc(&ctx, op);
++            break;
++        CASE_OP_32_64(ctpop):
++            done = fold_ctpop(&ctx, op);
++            break;
++        CASE_OP_32_64(div):
++        CASE_OP_32_64(divu):
++            done = fold_divide(&ctx, op);
++            break;
++        CASE_OP_32_64(eqv):
++            done = fold_eqv(&ctx, op);
++            break;
++        CASE_OP_32_64(ext8s):
++        CASE_OP_32_64(ext16s):
++        case INDEX_op_ext32s_i64:
++        case INDEX_op_ext_i32_i64:
++            done = fold_exts(&ctx, op);
++            break;
++        CASE_OP_32_64(ext8u):
++        CASE_OP_32_64(ext16u):
++        case INDEX_op_ext32u_i64:
++        case INDEX_op_extu_i32_i64:
++        case INDEX_op_extrl_i64_i32:
++        case INDEX_op_extrh_i64_i32:
++            done = fold_extu(&ctx, op);
++            break;
+         case INDEX_op_mb:
+             done = fold_mb(&ctx, op);
+             break;
++        CASE_OP_32_64(mul):
++            done = fold_mul(&ctx, op);
++            break;
++        CASE_OP_32_64(mulsh):
++        CASE_OP_32_64(muluh):
++            done = fold_mul_highpart(&ctx, op);
++            break;
++        CASE_OP_32_64(nand):
++            done = fold_nand(&ctx, op);
++            break;
++        CASE_OP_32_64(neg):
++            done = fold_neg(&ctx, op);
++            break;
++        CASE_OP_32_64(nor):
++            done = fold_nor(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(not):
++            done = fold_not(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(or):
++            done = fold_or(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(orc):
++            done = fold_orc(&ctx, op);
++            break;
+         case INDEX_op_qemu_ld_i32:
+         case INDEX_op_qemu_ld_i64:
+             done = fold_qemu_ld(&ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_qemu_st_i64:
+             done = fold_qemu_st(&ctx, op);
+             break;
+-
+-        default:
++        CASE_OP_32_64(rem):
++        CASE_OP_32_64(remu):
++            done = fold_remainder(&ctx, op);
++            break;
++        CASE_OP_32_64(rotl):
++        CASE_OP_32_64(rotr):
++        CASE_OP_32_64(sar):
++        CASE_OP_32_64(shl):
++        CASE_OP_32_64(shr):
++            done = fold_shift(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(sub):
++            done = fold_sub(&ctx, op);
++            break;
++        CASE_OP_32_64_VEC(xor):
++            done = fold_xor(&ctx, op);
+             break;
+         }
 --
 .25.1

-New patch
+[PULL 21/56] tcg/optimize: Split out fold_setcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
+file changed, 72 insertions(+), 73 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_setcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
++            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
++            goto do_setcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            goto do_setcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
++                                     op->args[4], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            op->args[2] = op->args[3];
++            op->args[3] = cond;
++            op->opc = INDEX_op_setcond_i32;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_setcond_high:
++        op->args[1] = op->args[2];
++        op->args[2] = op->args[4];
++        op->args[3] = cond;
++        op->opc = INDEX_op_setcond_i32;
++        break;
++    }
++    return false;
++
++ do_setcond_const:
++    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++}
++
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_setcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+-                                          op->args[5]);
+-            if (i >= 0) {
+-            do_setcond_const:
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0
+-                 && arg_is_const(op->args[4])
+-                 && arg_info(op->args[4])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_setcond_high:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_const;
+-                } else if (i > 0) {
+-                    goto do_setcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_setcond_low:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[2] = op->args[3];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_low;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        case INDEX_op_setcond2_i32:
++            done = fold_setcond2(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 22/56] tcg/optimize: Split out fold_brcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
+file changed, 81 insertions(+), 78 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[4];
++    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
++    TCGArg label = op->args[5];
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_brcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
++            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
++            goto do_brcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
++                                     op->args[2], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            goto do_brcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            op->opc = INDEX_op_brcond_i32;
++            op->args[1] = op->args[2];
++            op->args[2] = cond;
++            op->args[3] = label;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_brcond_high:
++        op->opc = INDEX_op_brcond_i32;
++        op->args[0] = op->args[1];
++        op->args[1] = op->args[3];
++        op->args[2] = cond;
++        op->args[3] = label;
++        break;
++
++    do_brcond_const:
++        if (i == 0) {
++            tcg_op_remove(ctx->tcg, op);
++            return true;
++        }
++        op->opc = INDEX_op_br;
++        op->args[0] = label;
++        break;
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_brcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+-                                          op->args[4]);
+-            if (i == 0) {
+-            do_brcond_false:
+-                tcg_op_remove(s, op);
+-                continue;
+-            }
+-            if (i > 0) {
+-            do_brcond_true:
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[5];
+-                break;
+-            }
+-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+-                 && arg_is_const(op->args[2])
+-                 && arg_info(op->args[2])->val == 0
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_brcond_high:
+-                op->opc = opc = INDEX_op_brcond_i32;
+-                op->args[0] = op->args[1];
+-                op->args[1] = op->args[3];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i > 0) {
+-                    goto do_brcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_brcond_low:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_high;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_low;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        case INDEX_op_brcond2_i32:
++            done = fold_brcond2(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 23/56] tcg/optimize: Split out fold_brcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 33 +++++++++++++++++++--------------
+file changed, 19 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[2];
++    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
++
++    if (i == 0) {
++        tcg_op_remove(ctx->tcg, op);
++        return true;
++    }
++    if (i > 0) {
++        op->opc = INDEX_op_br;
++        op->args[0] = op->args[3];
++    }
++    return false;
++}
++
+ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(brcond):
+-            i = do_constant_folding_cond(opc, op->args[0],
+-                                         op->args[1], op->args[2]);
+-            if (i == 0) {
+-                tcg_op_remove(s, op);
+-                continue;
+-            } else if (i > 0) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[3];
+-                break;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        CASE_OP_32_64(brcond):
++            done = fold_brcond(&ctx, op);
++            break;
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 24/56] tcg/optimize: Split out fold_setcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 23 ++++++++++++++---------
+file changed, 14 insertions(+), 9 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[3];
++    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[5];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(setcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[3]);
+-            if (i >= 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        CASE_OP_32_64(setcond):
++            done = fold_setcond(&ctx, op);
++            break;
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 25/56] tcg/optimize: Split out fold_mulu2_i32
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 37 +++++++++++++++++++++----------------
+file changed, 21 insertions(+), 16 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
++        uint32_t a = arg_info(op->args[2])->val;
++        uint32_t b = arg_info(op->args[3])->val;
++        uint64_t r = (uint64_t)a * b;
++        TCGArg rl, rh;
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++
++        rl = op->args[0];
++        rh = op->args[1];
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
++        return true;
++    }
++    return false;
++}
++
+ static bool fold_nand(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_mulu2_i32:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+-                uint32_t a = arg_info(op->args[2])->val;
+-                uint32_t b = arg_info(op->args[3])->val;
+-                uint64_t r = (uint64_t)a * b;
+-                TCGArg rl, rh;
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
+-
+-                rl = op->args[0];
+-                rh = op->args[1];
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(muluh):
+             done = fold_mul_highpart(&ctx, op);
+             break;
++        case INDEX_op_mulu2_i32:
++            done = fold_mulu2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64(nand):
+             done = fold_nand(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 26/56] tcg/optimize: Split out fold_addsub2_i32
+Add two additional helpers, fold_add2_i32 and fold_sub2_i32
+which will not be simple wrappers forever.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
+file changed, 44 insertions(+), 26 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
++        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
++        uint32_t al = arg_info(op->args[2])->val;
++        uint32_t ah = arg_info(op->args[3])->val;
++        uint32_t bl = arg_info(op->args[4])->val;
++        uint32_t bh = arg_info(op->args[5])->val;
++        uint64_t a = ((uint64_t)ah << 32) | al;
++        uint64_t b = ((uint64_t)bh << 32) | bl;
++        TCGArg rl, rh;
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++
++        if (add) {
++            a += b;
++        } else {
++            a -= b;
++        }
++
++        rl = op->args[0];
++        rh = op->args[1];
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
++        return true;
++    }
++    return false;
++}
++
++static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
++{
++    return fold_addsub2_i32(ctx, op, true);
++}
++
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
++{
++    return fold_addsub2_i32(ctx, op, false);
++}
++
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_add2_i32:
+-        case INDEX_op_sub2_i32:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
+-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+-                uint32_t al = arg_info(op->args[2])->val;
+-                uint32_t ah = arg_info(op->args[3])->val;
+-                uint32_t bl = arg_info(op->args[4])->val;
+-                uint32_t bh = arg_info(op->args[5])->val;
+-                uint64_t a = ((uint64_t)ah << 32) | al;
+-                uint64_t b = ((uint64_t)bh << 32) | bl;
+-                TCGArg rl, rh;
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
+-
+-                if (opc == INDEX_op_add2_i32) {
+-                    a += b;
+-                } else {
+-                    a -= b;
+-                }
+-
+-                rl = op->args[0];
+-                rh = op->args[1];
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
+-                continue;
+-            }
+-            break;
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(add):
+             done = fold_add(&ctx, op);
+             break;
++        case INDEX_op_add2_i32:
++            done = fold_add2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(and):
+             done = fold_and(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
++        case INDEX_op_sub2_i32:
++            done = fold_sub2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 27/56] tcg/optimize: Split out fold_movcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
+file changed, 31 insertions(+), 25 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_movcond(OptContext *ctx, TCGOp *op)
++{
++    TCGOpcode opc = op->opc;
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
++    }
++
++    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
++        uint64_t tv = arg_info(op->args[3])->val;
++        uint64_t fv = arg_info(op->args[4])->val;
++
++        opc = (opc == INDEX_op_movcond_i32
++               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
++
++        if (tv == 1 && fv == 0) {
++            op->opc = opc;
++            op->args[3] = cond;
++        } else if (fv == 1 && tv == 0) {
++            op->opc = opc;
++            op->args[3] = tcg_invert_cond(cond);
++        }
++    }
++    return false;
++}
++
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(movcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[5]);
+-            if (i >= 0) {
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
+-                continue;
+-            }
+-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+-                uint64_t tv = arg_info(op->args[3])->val;
+-                uint64_t fv = arg_info(op->args[4])->val;
+-                TCGCond cond = op->args[5];
+-
+-                if (fv == 1 && tv == 0) {
+-                    cond = tcg_invert_cond(cond);
+-                } else if (!(tv == 1 && fv == 0)) {
+-                    break;
+-                }
+-                op->args[3] = cond;
+-                op->opc = opc = (opc == INDEX_op_movcond_i32
+-                                 ? INDEX_op_setcond_i32
+-                                 : INDEX_op_setcond_i64);
+-            }
+-            break;
+-
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_mb:
+             done = fold_mb(&ctx, op);
+             break;
++        CASE_OP_32_64(movcond):
++            done = fold_movcond(&ctx, op);
++            break;
+         CASE_OP_32_64(mul):
+             done = fold_mul(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 28/56] tcg/optimize: Split out fold_extract2
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
+file changed, 22 insertions(+), 17 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_extract2(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t v1 = arg_info(op->args[1])->val;
++        uint64_t v2 = arg_info(op->args[2])->val;
++        int shr = op->args[3];
++
++        if (op->opc == INDEX_op_extract2_i64) {
++            v1 >>= shr;
++            v2 <<= 64 - shr;
++        } else {
++            v1 = (uint32_t)v1 >> shr;
++            v2 = (int32_t)v2 << (32 - shr);
++        }
++        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
++    }
++    return false;
++}
++
+ static bool fold_exts(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const1(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(extract2):
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                uint64_t v1 = arg_info(op->args[1])->val;
+-                uint64_t v2 = arg_info(op->args[2])->val;
+-                int shr = op->args[3];
+-
+-                if (opc == INDEX_op_extract2_i64) {
+-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
+-                } else {
+-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
+-                                    ((uint32_t)v2 << (32 - shr)));
+-                }
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
++        CASE_OP_32_64(extract2):
++            done = fold_extract2(&ctx, op);
++            break;
+         CASE_OP_32_64(ext8s):
+         CASE_OP_32_64(ext16s):
+         case INDEX_op_ext32s_i64:
+--
+.25.1

-New patch
+[PULL 29/56] tcg/optimize: Split out fold_extract, fold_sextract
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
+file changed, 30 insertions(+), 18 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_extract(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t;
++
++        t = arg_info(op->args[1])->val;
++        t = extract64(t, op->args[2], op->args[3]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_extract2(OptContext *ctx, TCGOp *op)
+ {
+     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+ }
++static bool fold_sextract(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t;
++
++        t = arg_info(op->args[1])->val;
++        t = sextract64(t, op->args[2], op->args[3]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(extract):
+-            if (arg_is_const(op->args[1])) {
+-                tmp = extract64(arg_info(op->args[1])->val,
+-                                op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+-        CASE_OP_32_64(sextract):
+-            if (arg_is_const(op->args[1])) {
+-                tmp = sextract64(arg_info(op->args[1])->val,
+-                                 op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
++        CASE_OP_32_64(extract):
++            done = fold_extract(&ctx, op);
++            break;
+         CASE_OP_32_64(extract2):
+             done = fold_extract2(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(sextract):
++            done = fold_sextract(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 30/56] tcg/optimize: Split out fold_deposit
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 25 +++++++++++++++----------
+file changed, 15 insertions(+), 10 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+     return fold_const1(ctx, op);
+ }
++static bool fold_deposit(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t t1 = arg_info(op->args[1])->val;
++        uint64_t t2 = arg_info(op->args[2])->val;
++
++        t1 = deposit64(t1, op->args[3], op->args[4], t2);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
++    }
++    return false;
++}
++
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(deposit):
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tmp = deposit64(arg_info(op->args[1])->val,
+-                                op->args[3], op->args[4],
+-                                arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
++        CASE_OP_32_64(deposit):
++            done = fold_deposit(&ctx, op);
++            break;
+         CASE_OP_32_64(div):
+         CASE_OP_32_64(divu):
+             done = fold_divide(&ctx, op);
+--
+.25.1

-New patch
+[PULL 31/56] tcg/optimize: Split out fold_count_zeros
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 32 ++++++++++++++++++--------------
+file changed, 18 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++
++        if (t != 0) {
++            t = do_constant_folding(op->opc, t, 0);
++            return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++        }
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
++    }
++    return false;
++}
++
+ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const1(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(clz):
+-        CASE_OP_32_64(ctz):
+-            if (arg_is_const(op->args[1])) {
+-                TCGArg v = arg_info(op->args[1])->val;
+-                if (v != 0) {
+-                    tmp = do_constant_folding(opc, v, 0);
+-                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                } else {
+-                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
+-                }
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(clz):
++        CASE_OP_32_64(ctz):
++            done = fold_count_zeros(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 32/56] tcg/optimize: Split out fold_bswap
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 27 ++++++++++++++++-----------
+file changed, 16 insertions(+), 11 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+     return false;
+ }
++static bool fold_bswap(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++
++        t = do_constant_folding(op->opc, t, op->args[2]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(bswap16):
+-        CASE_OP_32_64(bswap32):
+-        case INDEX_op_bswap64_i64:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+-                                          op->args[2]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(bswap16):
++        CASE_OP_32_64(bswap32):
++        case INDEX_op_bswap64_i64:
++            done = fold_bswap(&ctx, op);
++            break;
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+             done = fold_count_zeros(&ctx, op);
+--
+.25.1

-New patch
+[PULL 33/56] tcg/optimize: Split out fold_dup, fold_dup2
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
+file changed, 31 insertions(+), 22 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_dup(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++        t = dup_const(TCGOP_VECE(op), t);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
++static bool fold_dup2(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
++                               arg_info(op->args[2])->val);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++
++    if (args_are_copies(op->args[1], op->args[2])) {
++        op->opc = INDEX_op_dup_vec;
++        TCGOP_VECE(op) = MO_32;
++    }
++    return false;
++}
++
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+             break;
+-        case INDEX_op_dup_vec:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = arg_info(op->args[1])->val;
+-                tmp = dup_const(TCGOP_VECE(op), tmp);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+-        case INDEX_op_dup2_vec:
+-            assert(TCG_TARGET_REG_BITS == 32);
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0],
+-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
+-                                           arg_info(op->args[2])->val));
+-                continue;
+-            } else if (args_are_copies(op->args[1], op->args[2])) {
+-                op->opc = INDEX_op_dup_vec;
+-                TCGOP_VECE(op) = MO_32;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(divu):
+             done = fold_divide(&ctx, op);
+             break;
++        case INDEX_op_dup_vec:
++            done = fold_dup(&ctx, op);
++            break;
++        case INDEX_op_dup2_vec:
++            done = fold_dup2(&ctx, op);
++            break;
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
+--
+.25.1

-[PULL 07/24] target/hppa: Drop checks for singlestep_enabled
+[PULL 34/56] tcg/optimize: Split out fold_mov
-GDB single-stepping is now handled generically.
+This is the final entry in the main switch that was in a
 different form.  After this, we have the option to convert
 the switch into a function dispatch table.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/hppa/translate.c | 17 ++++-------------
+ tcg/optimize.c | 27 ++++++++++++++-------------
-file changed, 4 insertions(+), 13 deletions(-)
+file changed, 14 insertions(+), 13 deletions(-)
-diff --git a/target/hppa/translate.c b/target/hppa/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/hppa/translate.c
+--- a/tcg/optimize.c
-+++ b/target/hppa/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int which,
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
-     } else {
+     return true;
          copy_iaoq_entry(cpu_iaoq_f, f, cpu_iaoq_b);
          copy_iaoq_entry(cpu_iaoq_b, b, ctx->iaoq_n_var);
 -        if (ctx->base.singlestep_enabled) {
 -            gen_excp_1(EXCP_DEBUG);
 -        } else {
 -            tcg_gen_lookup_and_goto_ptr();
 -        }
 +        tcg_gen_lookup_and_goto_ptr();
      }
  }
-@@ -XXX,XX +XXX,XX @@ static bool do_rfi(DisasContext *ctx, bool rfi_r)
++static bool fold_mov(OptContext *ctx, TCGOp *op)
-         gen_helper_rfi(cpu_env);
++{
-     }
++    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-     /* Exit the TB to recognize new interrupts.  */
++}
--    if (ctx->base.singlestep_enabled) {
++
--        gen_excp_1(EXCP_DEBUG);
+ static bool fold_movcond(OptContext *ctx, TCGOp *op)
--    } else {
+ {
--        tcg_gen_exit_tb(NULL, 0);
+     TCGOpcode opc = op->opc;
--    }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+    tcg_gen_exit_tb(NULL, 0);
+             break;
-     ctx->base.is_jmp = DISAS_NORETURN;
+         }
-     return nullify_end(ctx);
+-        /* Propagate constants through copy operations and do constant
-@@ -XXX,XX +XXX,XX @@ static void hppa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
+-           folding.  Constants will be substituted to arguments by register
-         nullify_save(ctx);
+-           allocator where needed and possible.  Also detect copies. */
-         /* FALLTHRU */
++        /*
-     case DISAS_IAQ_N_UPDATED:
++         * Process each opcode.
--        if (ctx->base.singlestep_enabled) {
++         * Sorted alphabetically by opcode as much as possible.
--            gen_excp_1(EXCP_DEBUG);
++         */
--        } else if (is_jmp != DISAS_IAQ_N_STALE_EXIT) {
+         switch (opc) {
-+        if (is_jmp != DISAS_IAQ_N_STALE_EXIT) {
+-        CASE_OP_32_64_VEC(mov):
-             tcg_gen_lookup_and_goto_ptr();
+-            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            break;
 -
 -        default:
 -            break;
 -
 -        /* ---------------------------------------------------------- */
 -        /* Sorted alphabetically by opcode as much as possible. */
 -
          CASE_OP_32_64_VEC(add):
              done = fold_add(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 +        CASE_OP_32_64_VEC(mov):
 +            done = fold_mov(&ctx, op);
 +            break;
          CASE_OP_32_64(movcond):
              done = fold_movcond(&ctx, op);
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(xor):
              done = fold_xor(&ctx, op);
              break;
 +        default:
 +            break;
          }
-         /* FALLTHRU */
-     case DISAS_EXIT:
+         if (!done) {
 --
 .25.1

-[PULL 24/24] Revert "cpu: Move cpu_common_props to hw/core/cpu.c"
+[PULL 35/56] tcg/optimize: Split out fold_xx_to_i
-This reverts commit 1b36e4f5a5de585210ea95f2257839c2312be28f.
+Pull the "op r, a, a => movi r, 0" optimization into a function,
 and use it in the outer opcode fold functions.
-Despite a comment saying why cpu_common_props cannot be placed in
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-a file that is compiled once, it was moved anyway.  Revert that.
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Since then, Property is not defined in hw/core/cpu.h, so it is now
 easier to declare a function to install the properties rather than
 the Property array itself.
 Cc: Eduardo Habkost <ehabkost@redhat.com>
 Suggested-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h |  1 +
+ tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
- cpu.c                 | 21 +++++++++++++++++++++
+file changed, 24 insertions(+), 17 deletions(-)
  hw/core/cpu-common.c  | 17 +----------------
 files changed, 23 insertions(+), 16 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void QEMU_NORETURN cpu_abort(CPUState *cpu, const char *fmt, ...)
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-     GCC_FMT_ATTR(2, 3);
+     return false;
  /* $(top_srcdir)/cpu.c */
 +void cpu_class_init_props(DeviceClass *dc);
  void cpu_exec_initfn(CPUState *cpu);
  void cpu_exec_realizefn(CPUState *cpu, Error **errp);
  void cpu_exec_unrealizefn(CPUState *cpu);
 diff --git a/cpu.c b/cpu.c
 index XXXXXXX..XXXXXXX 100644
 --- a/cpu.c
 +++ b/cpu.c
@@ -XXX,XX +XXX,XX @@ void cpu_exec_unrealizefn(CPUState *cpu)
      cpu_list_remove(cpu);
  }
-+static Property cpu_common_props[] = {
++/* If the binary operation has both arguments equal, fold to @i. */
-+#ifndef CONFIG_USER_ONLY
++static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 +    /*
 +     * Create a memory property for softmmu CPU object,
 +     * so users can wire up its memory. (This can't go in hw/core/cpu.c
 +     * because that file is compiled only once for both user-mode
 +     * and system builds.) The default if no link is set up is to use
 +     * the system address space.
 +     */
 +    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
 +                     MemoryRegion *),
 +#endif
 +    DEFINE_PROP_BOOL("start-powered-off", CPUState, start_powered_off, false),
 +    DEFINE_PROP_END_OF_LIST(),
 +};
 +
 +void cpu_class_init_props(DeviceClass *dc)
 +{
-+    device_class_set_props(dc, cpu_common_props);
++    if (args_are_copies(op->args[1], op->args[2])) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +    }
 +    return false;
 +}
 +
- void cpu_exec_initfn(CPUState *cpu)
+ /*
   * These outermost fold_<op> functions are sorted alphabetically.
   */
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
-     cpu->as = NULL;
+-    return fold_const2(ctx, op);
-diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c
++    if (fold_const2(ctx, op) ||
-index XXXXXXX..XXXXXXX 100644
++        fold_xx_to_i(ctx, op, 0)) {
---- a/hw/core/cpu-common.c
++        return true;
-+++ b/hw/core/cpu-common.c
++    }
-@@ -XXX,XX +XXX,XX @@ static int64_t cpu_common_get_arch_id(CPUState *cpu)
++    return false;
      return cpu->cpu_index;
  }
--static Property cpu_common_props[] = {
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
--#ifndef CONFIG_USER_ONLY
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
--    /* Create a memory property for softmmu CPU object,
--     * so users can wire up its memory. (This can't go in hw/core/cpu.c
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
--     * because that file is compiled only once for both user-mode
+ {
--     * and system builds.) The default if no link is set up is to use
+-    return fold_const2(ctx, op);
--     * the system address space.
++    if (fold_const2(ctx, op) ||
--     */
++        fold_xx_to_i(ctx, op, 0)) {
--    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
++        return true;
--                     MemoryRegion *),
++    }
--#endif
++    return false;
--    DEFINE_PROP_BOOL("start-powered-off", CPUState, start_powered_off, false),
+ }
--    DEFINE_PROP_END_OF_LIST(),
--};
+ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify expression for "op r, a, a => movi r, 0" cases */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(andc):
 -        CASE_OP_32_64_VEC(sub):
 -        CASE_OP_32_64_VEC(xor):
 -            if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
- static void cpu_class_init(ObjectClass *klass, void *data)
+         /*
- {
+          * Process each opcode.
-     DeviceClass *dc = DEVICE_CLASS(klass);
+          * Sorted alphabetically by opcode as much as possible.
@@ -XXX,XX +XXX,XX @@ static void cpu_class_init(ObjectClass *klass, void *data)
      dc->realize = cpu_common_realizefn;
      dc->unrealize = cpu_common_unrealizefn;
      dc->reset = cpu_common_reset;
 -    device_class_set_props(dc, cpu_common_props);
 +    cpu_class_init_props(dc);
      /*
       * Reason: CPUs still need special care by board code: wiring up
       * IRQs, adding reset handlers, halting non-first CPUs, ...
 --
 .25.1

-[PULL 15/24] target/openrisc: Drop checks for singlestep_enabled
+[PULL 36/56] tcg/optimize: Split out fold_xx_to_x
-GDB single-stepping is now handled generically.
+Pull the "op r, a, a => mov r, a" optimization into a function,
 and use it in the outer opcode fold functions.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/openrisc/translate.c | 18 +++---------------
+ tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
-file changed, 3 insertions(+), 15 deletions(-)
+file changed, 24 insertions(+), 15 deletions(-)
-diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/openrisc/translate.c
+--- a/tcg/optimize.c
-+++ b/target/openrisc/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void openrisc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-             /* The jump destination is indirect/computed; use jmp_pc.  */
+     return false;
-             tcg_gen_mov_tl(cpu_pc, jmp_pc);
+ }
-             tcg_gen_discard_tl(jmp_pc);
--            if (unlikely(dc->base.singlestep_enabled)) {
++/* If the binary operation has both arguments equal, fold to identity. */
--                gen_exception(dc, EXCP_DEBUG);
++static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
--            } else {
++{
--                tcg_gen_lookup_and_goto_ptr();
++    if (args_are_copies(op->args[1], op->args[2])) {
--            }
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-+            tcg_gen_lookup_and_goto_ptr();
++    }
 +    return false;
 +}
 +
  /*
   * These outermost fold_<op> functions are sorted alphabetically.
 + *
 + * The ordering of the transformations should be:
 + *   1) those that produce a constant
 + *   2) those that produce a copy
 + *   3) those that produce information about the result value.
   */
  static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_x(ctx, op)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_andc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xx_to_x(ctx, op)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
-         /* The jump destination is direct; use jmp_pc_imm.
-@@ -XXX,XX +XXX,XX @@ static void openrisc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
+-        /* Simplify expression for "op r, a, a => mov r, a" cases */
-             break;
+-        switch (opc) {
-         }
+-        CASE_OP_32_64_VEC(or):
-         tcg_gen_movi_tl(cpu_pc, jmp_dest);
+-        CASE_OP_32_64_VEC(and):
--        if (unlikely(dc->base.singlestep_enabled)) {
+-            if (args_are_copies(op->args[1], op->args[2])) {
--            gen_exception(dc, EXCP_DEBUG);
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
--        } else {
+-                continue;
--            tcg_gen_lookup_and_goto_ptr();
+-            }
 -            break;
 -        default:
 -            break;
 -        }
-+        tcg_gen_lookup_and_goto_ptr();
+-
-         break;
+         /*
+          * Process each opcode.
-     case DISAS_EXIT:
+          * Sorted alphabetically by opcode as much as possible.
 -        if (unlikely(dc->base.singlestep_enabled)) {
 -            gen_exception(dc, EXCP_DEBUG);
 -        } else {
 -            tcg_gen_exit_tb(NULL, 0);
 -        }
 +        tcg_gen_exit_tb(NULL, 0);
          break;
      default:
          g_assert_not_reached();
 --
 .25.1

-[PULL 03/24] target/avr: Drop checks for singlestep_enabled
+[PULL 37/56] tcg/optimize: Split out fold_xi_to_i
-GDB single-stepping is now handled generically.
+Pull the "op r, a, 0 => movi r, 0" optimization into a function,
 and use it in the outer opcode fold functions.
-Tested-by: Michael Rolnik <mrolnik@gmail.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Michael Rolnik <mrolnik@gmail.com>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/avr/translate.c | 19 ++++---------------
+ tcg/optimize.c | 38 ++++++++++++++++++++------------------
-file changed, 4 insertions(+), 15 deletions(-)
+file changed, 20 insertions(+), 18 deletions(-)
-diff --git a/target/avr/translate.c b/target/avr/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/avr/translate.c
+--- a/tcg/optimize.c
-+++ b/target/avr/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-         tcg_gen_exit_tb(tb, n);
+     return false;
-     } else {
+ }
-         tcg_gen_movi_i32(cpu_pc, dest);
--        if (ctx->base.singlestep_enabled) {
++/* If the binary operation has second argument @i, fold to @i. */
--            gen_helper_debug(cpu_env);
++static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
--        } else {
++{
--            tcg_gen_lookup_and_goto_ptr();
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
--        }
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
-+        tcg_gen_lookup_and_goto_ptr();
++    }
 +    return false;
 +}
 +
  /* If the binary operation has both arguments equal, fold to @i. */
  static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_i(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
-     ctx->base.is_jmp = DISAS_NORETURN;
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
  static bool fold_mul(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
-@@ -XXX,XX +XXX,XX @@ static void avr_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
-         tcg_gen_movi_tl(cpu_pc, ctx->npc);
+ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
-         /* fall through */
+ {
-     case DISAS_LOOKUP:
+-    return fold_const2(ctx, op);
--        if (!ctx->base.singlestep_enabled) {
++    if (fold_const2(ctx, op) ||
--            tcg_gen_lookup_and_goto_ptr();
++        fold_xi_to_i(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              continue;
          }
 -        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(mul):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -            if (arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
--        /* fall through */
+-
-+        tcg_gen_lookup_and_goto_ptr();
+         /*
-+        break;
+          * Process each opcode.
-     case DISAS_EXIT:
+          * Sorted alphabetically by opcode as much as possible.
 -        if (ctx->base.singlestep_enabled) {
 -            gen_helper_debug(cpu_env);
 -        } else {
 -            tcg_gen_exit_tb(NULL, 0);
 -        }
 +        tcg_gen_exit_tb(NULL, 0);
          break;
      default:
          g_assert_not_reached();
 --
 .25.1

-New patch
+[PULL 38/56] tcg/optimize: Add type to OptContext
+Compute the type of the operation early.
 There are at least 4 places that used a def->flags ladder
 to determine the type of the operation being optimized.
 There were two places that assumed !TCG_OPF_64BIT means
 TCG_TYPE_I32, and so could potentially compute incorrect
 results for vector operations.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 file changed, 89 insertions(+), 60 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      /* In flight values from optimization. */
      uint64_t z_mask;
 +    TCGType type;
  } OptContext;
  static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
 -    const TCGOpDef *def;
      TempOptInfo *di;
      TempOptInfo *si;
      uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      reset_ts(dst_ts);
      di = ts_info(dst_ts);
      si = ts_info(src_ts);
 -    def = &tcg_op_defs[op->opc];
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        new_op = INDEX_op_mov_vec;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        new_op = INDEX_op_mov_i64;
 -    } else {
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
          new_op = INDEX_op_mov_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        new_op = INDEX_op_mov_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
 +        new_op = INDEX_op_mov_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
      }
      op->opc = new_op;
 -    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
      op->args[0] = dst;
      op->args[1] = src;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op->opc];
 -    TCGType type;
 -    TCGTemp *tv;
 -
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        type = TCG_TYPE_I64;
 -    } else {
 -        type = TCG_TYPE_I32;
 -    }
 -
      /* Convert movi to mov with constant temp. */
 -    tv = tcg_constant_internal(type, val);
 +    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
 +
      init_ts_info(ctx, tv);
      return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
      }
  }
 -static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
 +static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
 +                                    uint64_t x, uint64_t y)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op];
      uint64_t res = do_constant_folding_2(op, x, y);
 -    if (!(def->flags & TCG_OPF_64BIT)) {
 +    if (type == TCG_TYPE_I32) {
          res = (int32_t)res;
      }
      return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
   * Return -1 if the condition can't be simplified,
   * and the result of the condition (0 or 1) if it can.
   */
 -static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +static int do_constant_folding_cond(TCGType type, TCGArg x,
                                      TCGArg y, TCGCond c)
  {
      uint64_t xv = arg_info(x)->val;
      uint64_t yv = arg_info(y)->val;
      if (arg_is_const(x) && arg_is_const(y)) {
 -        const TCGOpDef *def = &tcg_op_defs[op];
 -        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
 -        if (def->flags & TCG_OPF_64BIT) {
 -            return do_constant_folding_cond_64(xv, yv, c);
 -        } else {
 +        switch (type) {
 +        case TCG_TYPE_I32:
              return do_constant_folding_cond_32(xv, yv, c);
 +        case TCG_TYPE_I64:
 +            return do_constant_folding_cond_64(xv, yv, c);
 +        default:
 +            /* Only scalar comparisons are optimizable */
 +            return -1;
          }
      } else if (args_are_copies(x, y)) {
          return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, 0);
 +        t = do_constant_folding(op->opc, ctx->type, t, 0);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
          uint64_t t1 = arg_info(op->args[1])->val;
          uint64_t t2 = arg_info(op->args[2])->val;
 -        t1 = do_constant_folding(op->opc, t1, t2);
 +        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                       op->args[2], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
              goto do_brcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, op->args[2]);
 +        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          uint64_t t = arg_info(op->args[1])->val;
          if (t != 0) {
 -            t = do_constant_folding(op->opc, t, 0);
 +            t = do_constant_folding(op->opc, ctx->type, t, 0);
              return tcg_opt_gen_movi(ctx, op, op->args[0], t);
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
 -    TCGOpcode opc = op->opc;
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
 +        TCGOpcode opc;
 -        opc = (opc == INDEX_op_movcond_i32
 -               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
 +        switch (ctx->type) {
 +        case TCG_TYPE_I32:
 +            opc = INDEX_op_setcond_i32;
 +            break;
 +        case TCG_TYPE_I64:
 +            opc = INDEX_op_setcond_i64;
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
          if (tv == 1 && fv == 0) {
              op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
              goto do_setcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                       op->args[4], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
          copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 +        /* Pre-compute the type of the operation. */
 +        if (def->flags & TCG_OPF_VECTOR) {
 +            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
 +        } else if (def->flags & TCG_OPF_64BIT) {
 +            ctx.type = TCG_TYPE_I64;
 +        } else {
 +            ctx.type = TCG_TYPE_I32;
 +        }
 +
          /* For commutative operations make constant second argument */
          switch (opc) {
          CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Proceed with possible constant folding. */
                      break;
                  }
 -                if (opc == INDEX_op_sub_i32) {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      neg_op = INDEX_op_neg_i32;
                      have_neg = TCG_TARGET_HAS_neg_i32;
 -                } else if (opc == INDEX_op_sub_i64) {
 +                    break;
 +                case TCG_TYPE_I64:
                      neg_op = INDEX_op_neg_i64;
                      have_neg = TCG_TARGET_HAS_neg_i64;
 -                } else if (TCG_TARGET_HAS_neg_vec) {
 -                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -                    unsigned vece = TCGOP_VECE(op);
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
 -                } else {
                      break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    neg_op = INDEX_op_neg_vec;
 +                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 +                                                   TCGOP_VECE(op)) > 0;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_neg) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGOpcode not_op;
                  bool have_not;
 -                if (def->flags & TCG_OPF_VECTOR) {
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                } else if (def->flags & TCG_OPF_64BIT) {
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                } else {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      not_op = INDEX_op_not_i32;
                      have_not = TCG_TARGET_HAS_not_i32;
 +                    break;
 +                case TCG_TYPE_I64:
 +                    not_op = INDEX_op_not_i64;
 +                    have_not = TCG_TARGET_HAS_not_i64;
 +                    break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    not_op = INDEX_op_not_vec;
 +                    have_not = TCG_TARGET_HAS_not_vec;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_not) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
          partmask = z_mask;
 -        if (!(def->flags & TCG_OPF_64BIT)) {
 +        if (ctx.type == TCG_TYPE_I32) {
              z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
 --
 .25.1

-New patch
+[PULL 39/56] tcg/optimize: Split out fold_to_not
+Split out the conditional conversion from a more complex logical
 operation to a simple NOT.  Create a couple more helpers to make
 this easy for the outer-most logical operations.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
 file changed, 86 insertions(+), 72 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +/*
 + * Convert @op to NOT, if NOT is supported by the host.
 + * Return true f the conversion is successful, which will still
 + * indicate that the processing is complete.
 + */
 +static bool fold_not(OptContext *ctx, TCGOp *op);
 +static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
 +{
 +    TCGOpcode not_op;
 +    bool have_not;
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        not_op = INDEX_op_not_i32;
 +        have_not = TCG_TARGET_HAS_not_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        not_op = INDEX_op_not_i64;
 +        have_not = TCG_TARGET_HAS_not_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        not_op = INDEX_op_not_vec;
 +        have_not = TCG_TARGET_HAS_not_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    if (have_not) {
 +        op->opc = not_op;
 +        op->args[1] = op->args[idx];
 +        return fold_not(ctx, op);
 +    }
 +    return false;
 +}
 +
 +/* If the binary operation has first argument @i, fold to NOT. */
 +static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
 +        return fold_to_not(ctx, op, 2);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has second argument @i, fold to @i. */
  static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
      return false;
  }
 +/* If the binary operation has second argument @i, fold to NOT. */
 +static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return fold_to_not(ctx, op, 1);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has both arguments equal, fold to @i. */
  static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, -1)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_not(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    /* Because of fold_to_not, we want to always return true, via finish. */
 +    finish_folding(ctx, op);
 +    return true;
  }
  static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_ix_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              }
              break;
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(nand):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == -1) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64(nor):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(andc):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == -1) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(orc):
 -        CASE_OP_32_64(eqv):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        try_not:
 -            {
 -                TCGOpcode not_op;
 -                bool have_not;
 -
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    not_op = INDEX_op_not_i32;
 -                    have_not = TCG_TARGET_HAS_not_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_not) {
 -                    break;
 -                }
 -                op->opc = not_op;
 -                reset_temp(op->args[0]);
 -                op->args[1] = op->args[i];
 -                continue;
 -            }
          default:
              break;
          }
 --
 .25.1

-[PULL 20/24] target/s390x: Drop check for singlestep_enabled
+[PULL 40/56] tcg/optimize: Split out fold_sub_to_neg
-GDB single-stepping is now handled generically.
+Even though there is only one user, place this more complex
 conversion into its own helper.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/s390x/tcg/translate.c | 8 ++------
+ tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
-file changed, 2 insertions(+), 6 deletions(-)
+file changed, 47 insertions(+), 42 deletions(-)
-diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/s390x/tcg/translate.c
+--- a/tcg/optimize.c
-+++ b/target/s390x/tcg/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct DisasContext {
+@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
-     uint64_t pc_tmp;
-     uint32_t ilen;
+ static bool fold_neg(OptContext *ctx, TCGOp *op)
-     enum cc_op cc_op;
+ {
--    bool do_debug;
+-    return fold_const1(ctx, op);
- };
++    if (fold_const1(ctx, op)) {
++        return true;
- /* Information carried about a condition to be evaluated.  */
++    }
-@@ -XXX,XX +XXX,XX @@ static void s390x_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
++    /*
++     * Because of fold_sub_to_neg, we want to always return true,
-     dc->cc_op = CC_OP_DYNAMIC;
++     * via finish_folding.
-     dc->ex_value = dc->base.tb->cs_base;
++     */
--    dc->do_debug = dc->base.singlestep_enabled;
++    finish_folding(ctx, op);
 +    return true;
  }
- static void s390x_tr_tb_start(DisasContextBase *db, CPUState *cs)
+ static bool fold_nor(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void s390x_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
-         /* FALLTHRU */
+     return fold_const2(ctx, op);
-     case DISAS_PC_CC_UPDATED:
+ }
-         /* Exit the TB, either by raising a debug exception or by return.  */
--        if (dc->do_debug) {
++static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
--            gen_exception(EXCP_DEBUG);
++{
--        } else if ((dc->base.tb->flags & FLAG_MASK_PER) ||
++    TCGOpcode neg_op;
--                   dc->base.is_jmp == DISAS_PC_STALE_NOCHAIN) {
++    bool have_neg;
-+        if ((dc->base.tb->flags & FLAG_MASK_PER) ||
++
-+             dc->base.is_jmp == DISAS_PC_STALE_NOCHAIN) {
++    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
-             tcg_gen_exit_tb(NULL, 0);
++        return false;
-         } else {
++    }
-             tcg_gen_lookup_and_goto_ptr();
++
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        neg_op = INDEX_op_neg_i32;
 +        have_neg = TCG_TARGET_HAS_neg_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        neg_op = INDEX_op_neg_i64;
 +        have_neg = TCG_TARGET_HAS_neg_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        neg_op = INDEX_op_neg_vec;
 +        have_neg = (TCG_TARGET_HAS_neg_vec &&
 +                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    if (have_neg) {
 +        op->opc = neg_op;
 +        op->args[1] = op->args[2];
 +        return fold_neg(ctx, op);
 +    }
 +    return false;
 +}
 +
  static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_sub_to_neg(ctx, op)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  continue;
              }
              break;
 -        CASE_OP_32_64_VEC(sub):
 -            {
 -                TCGOpcode neg_op;
 -                bool have_neg;
 -
 -                if (arg_is_const(op->args[2])) {
 -                    /* Proceed with possible constant folding. */
 -                    break;
 -                }
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    neg_op = INDEX_op_neg_i32;
 -                    have_neg = TCG_TARGET_HAS_neg_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    neg_op = INDEX_op_neg_i64;
 -                    have_neg = TCG_TARGET_HAS_neg_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 -                                                   TCGOP_VECE(op)) > 0;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_neg) {
 -                    break;
 -                }
 -                if (arg_is_const(op->args[1])
 -                    && arg_info(op->args[1])->val == 0) {
 -                    op->opc = neg_op;
 -                    reset_temp(op->args[0]);
 -                    op->args[1] = op->args[2];
 -                    continue;
 -                }
 -            }
 -            break;
          default:
              break;
          }
 --
 .25.1

-[PULL 11/24] target/microblaze: Check CF_NO_GOTO_TB for DISAS_JUMP
+[PULL 41/56] tcg/optimize: Split out fold_xi_to_x
-We were using singlestep_enabled as a proxy for whether
+Pull the "op r, a, i => mov r, a" optimization into a function,
-translator_use_goto_tb would always return false.
+and use them in the outer-most logical operations.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/microblaze/translate.c | 4 ++--
+ tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
-file changed, 2 insertions(+), 2 deletions(-)
+file changed, 26 insertions(+), 35 deletions(-)
-diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/microblaze/translate.c
+--- a/tcg/optimize.c
-+++ b/target/microblaze/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void mb_tr_tb_stop(DisasContextBase *dcb, CPUState *cs)
+@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
-         break;
+     return false;
+ }
-     case DISAS_JUMP:
--        if (dc->jmp_dest != -1 && !cs->singlestep_enabled) {
++/* If the binary operation has second argument @i, fold to identity. */
-+        if (dc->jmp_dest != -1 && !(tb_cflags(dc->base.tb) & CF_NO_GOTO_TB)) {
++static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
-             /* Direct jump. */
++{
-             tcg_gen_discard_i32(cpu_btarget);
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
-@@ -XXX,XX +XXX,XX @@ static void mb_tr_tb_stop(DisasContextBase *dcb, CPUState *cs)
++    }
-             return;
++    return false;
 +}
 +
  /* If the binary operation has second argument @i, fold to NOT. */
  static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
  static bool fold_add(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
 +        fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, -1) ||
          fold_ix_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_x(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_sub_to_neg(ctx, op)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
--        /* Indirect jump (or direct jump w/ singlestep) */
+-        /* Simplify expression for "op r, a, const => mov r, a" cases */
-+        /* Indirect jump (or direct jump w/ goto_tb disabled) */
+-        switch (opc) {
-         tcg_gen_mov_i32(cpu_pc, cpu_btarget);
+-        CASE_OP_32_64_VEC(add):
-         tcg_gen_discard_i32(cpu_btarget);
+-        CASE_OP_32_64_VEC(sub):
+-        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64_VEC(andc):
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -                continue;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(orc):
 -        CASE_OP_32_64(eqv):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == -1) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /* Simplify using known-zero bits. Currently only ops with a single
             output argument is supported. */
          z_mask = -1;
 --
 .25.1

-[PULL 02/24] target/alpha: Drop checks for singlestep_enabled
+[PULL 42/56] tcg/optimize: Split out fold_ix_to_i
-GDB single-stepping is now handled generically.
+Pull the "op r, 0, b => movi r, 0" optimization into a function,
 and use it in fold_shift.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/alpha/translate.c | 13 +++----------
+ tcg/optimize.c | 28 ++++++++++------------------
-file changed, 3 insertions(+), 10 deletions(-)
+file changed, 10 insertions(+), 18 deletions(-)
-diff --git a/target/alpha/translate.c b/target/alpha/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/alpha/translate.c
+--- a/tcg/optimize.c
-+++ b/target/alpha/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void alpha_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
+@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
-         tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
+     return false;
-         /* FALLTHRU */
+ }
-     case DISAS_PC_UPDATED:
--        if (!ctx->base.singlestep_enabled) {
++/* If the binary operation has first argument @i, fold to @i. */
--            tcg_gen_lookup_and_goto_ptr();
++static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has first argument @i, fold to NOT. */
  static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 +        fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
 -           and "sub r, 0, a => neg r, a" case.  */
 -        switch (opc) {
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -            if (arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
--        /* FALLTHRU */
+-
-+        tcg_gen_lookup_and_goto_ptr();
+         /* Simplify using known-zero bits. Currently only ops with a single
-+        break;
+            output argument is supported. */
-     case DISAS_PC_UPDATED_NOCHAIN:
+         z_mask = -1;
 -        if (ctx->base.singlestep_enabled) {
 -            gen_excp_1(EXCP_DEBUG, 0);
 -        } else {
 -            tcg_gen_exit_tb(NULL, 0);
 -        }
 +        tcg_gen_exit_tb(NULL, 0);
          break;
      default:
          g_assert_not_reached();
 --
 .25.1

-[PULL 04/24] target/cris: Drop checks for singlestep_enabled
+[PULL 43/56] tcg/optimize: Split out fold_masks
-GDB single-stepping is now handled generically.
+Move all of the known-zero optimizations into the per-opcode
 functions.  Use fold_masks when there is a possibility of the
 result being determined, and simply set ctx->z_mask otherwise.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/cris/translate.c | 16 ----------------
+ tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
-file changed, 16 deletions(-)
+file changed, 294 insertions(+), 251 deletions(-)
-diff --git a/target/cris/translate.c b/target/cris/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/cris/translate.c
+--- a/tcg/optimize.c
-+++ b/target/cris/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void cris_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      TCGTempSet temps_used;
      /* In flight values from optimization. */
 -    uint64_t z_mask;
 +    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
 +    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
      TCGType type;
  } OptContext;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_masks(OptContext *ctx, TCGOp *op)
 +{
 +    uint64_t a_mask = ctx->a_mask;
 +    uint64_t z_mask = ctx->z_mask;
 +
 +    /*
 +     * 32-bit ops generate 32-bit results.  For the result is zero test
 +     * below, we can ignore high bits, but for further optimizations we
 +     * need to record that the high bits contain garbage.
 +     */
 +    if (ctx->type == TCG_TYPE_I32) {
 +        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
 +        a_mask &= MAKE_64BIT_MASK(0, 32);
 +        z_mask &= MAKE_64BIT_MASK(0, 32);
 +    }
 +
 +    if (z_mask == 0) {
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
 +    }
 +    if (a_mask == 0) {
 +        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +    }
 +    return false;
 +}
 +
  /*
   * Convert @op to NOT, if NOT is supported by the host.
   * Return true f the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z1, z2;
 +
      if (fold_const2(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +    z2 = arg_info(op->args[2])->z_mask;
 +    ctx->z_mask = z1 & z2;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer affected bits from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        ctx->a_mask = z1 & ~z2;
 +    }
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z1;
 +
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer anything from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
 +        ctx->a_mask = z1 & ~z2;
 +        z1 &= z2;
 +    }
 +    ctx->z_mask = z1;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask, sign;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
          t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    switch (op->opc) {
 +    case INDEX_op_bswap16_i32:
 +    case INDEX_op_bswap16_i64:
 +        z_mask = bswap16(z_mask);
 +        sign = INT16_MIN;
 +        break;
 +    case INDEX_op_bswap32_i32:
 +    case INDEX_op_bswap32_i64:
 +        z_mask = bswap32(z_mask);
 +        sign = INT32_MIN;
 +        break;
 +    case INDEX_op_bswap64_i64:
 +        z_mask = bswap64(z_mask);
 +        sign = INT64_MIN;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 +    case TCG_BSWAP_OZ:
 +        break;
 +    case TCG_BSWAP_OS:
 +        /* If the sign bit may be 1, force all the bits above to 1. */
 +        if (z_mask & sign) {
 +            z_mask |= sign;
 +        }
 +        break;
 +    default:
 +        /* The high bits are undefined: force all bits above the sign to 1. */
 +        z_mask |= sign << 1;
 +        break;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
  static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          }
-     }
+         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
+     }
--    if (unlikely(dc->base.singlestep_enabled)) {
++
--        switch (is_jmp) {
++    switch (ctx->type) {
--        case DISAS_TOO_MANY:
++    case TCG_TYPE_I32:
--        case DISAS_UPDATE_NEXT:
++        z_mask = 31;
--            tcg_gen_movi_tl(env_pc, npc);
++        break;
--            /* fall through */
++    case TCG_TYPE_I64:
--        case DISAS_JUMP:
++        z_mask = 63;
--        case DISAS_UPDATE:
++        break;
--            t_gen_raise_exception(EXCP_DEBUG);
++    default:
--            return;
++        g_assert_not_reached();
 +    }
 +    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
 +
      return false;
  }
  static bool fold_ctpop(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        ctx->z_mask = 32 | 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        ctx->z_mask = 64 | 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
  }
  static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
          t1 = deposit64(t1, op->args[3], op->args[4], t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
 +
 +    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                            op->args[3], op->args[4],
 +                            arg_info(op->args[2])->z_mask);
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
          t = extract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask, sign;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8s):
 +        sign = INT8_MIN;
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16s):
 +        sign = INT16_MIN;
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_ext_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32s_i64:
 +        sign = INT32_MIN;
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    if (z_mask & sign) {
 +        z_mask |= sign;
 +    } else if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extu(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8u):
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16u):
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_extrl_i64_i32:
 +    case INDEX_op_extu_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32u_i64:
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    case INDEX_op_extrh_i64_i32:
 +        type_change = true;
 +        z_mask >>= 32;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    ctx->z_mask = z_mask;
 +    if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    return fold_masks(ctx, op);
  }
  static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
 +    ctx->z_mask = arg_info(op->args[3])->z_mask
 +                | arg_info(op->args[4])->z_mask;
 +
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
  static bool fold_neg(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (fold_const1(ctx, op)) {
          return true;
      }
 +
 +    /* Set to 1 all bits to the left of the rightmost.  */
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    ctx->z_mask = -(z_mask & -z_mask);
 +
      /*
       * Because of fold_sub_to_neg, we want to always return true,
       * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
 +    const TCGOpDef *def = &tcg_op_defs[op->opc];
 +    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 +    MemOp mop = get_memop(oi);
 +    int width = 8 * memop_size(mop);
 +
 +    if (!(mop & MO_SIGN) && width < 64) {
 +        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    }
 +
      /* Opcodes that touch guest memory stop the mb optimization.  */
      ctx->prev_mb = NULL;
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
 +
 +    ctx->z_mask = 1;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          op->opc = INDEX_op_setcond_i32;
          break;
      }
 +
 +    ctx->z_mask = 1;
      return false;
   do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 +    int64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
          t = sextract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0 && z_mask >= 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
 +
 +    if (arg_is_const(op->args[2])) {
 +        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
 +                                          arg_info(op->args[1])->z_mask,
 +                                          arg_info(op->args[2])->val);
 +        return fold_masks(ctx, op);
 +    }
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
      return fold_addsub2_i32(ctx, op, false);
  }
 +static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 +{
 +    /* We can't do any folding with a load, but we can record bits. */
 +    switch (op->opc) {
 +    CASE_OP_32_64(ld8u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        break;
 +    CASE_OP_32_64(ld16u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        break;
 +    case INDEX_op_ld32u_i64:
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
 +}
 +
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      }
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
 -        uint64_t z_mask, partmask, affected, tmp;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def;
          bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify using known-zero bits. Currently only ops with a single
 -           output argument is supported. */
 -        z_mask = -1;
 -        affected = -1;
 -        switch (opc) {
 -        CASE_OP_32_64(ext8s):
 -            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext8u):
 -            z_mask = 0xff;
 -            goto and_const;
 -        CASE_OP_32_64(ext16s):
 -            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext16u):
 -            z_mask = 0xffff;
 -            goto and_const;
 -        case INDEX_op_ext32s_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_ext32u_i64:
 -            z_mask = 0xffffffffU;
 -            goto and_const;
 -
 -        CASE_OP_32_64(and):
 -            z_mask = arg_info(op->args[2])->z_mask;
 -            if (arg_is_const(op->args[2])) {
 -        and_const:
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            z_mask = arg_info(op->args[1])->z_mask & z_mask;
 -            break;
 -
 -        case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_extu_i32_i64:
 -            /* We do not compute affected as it is a size changing op.  */
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(andc):
 -            /* Known-zeros does not imply known-ones.  Therefore unless
 -               op->args[2] is constant, we can't infer anything from it.  */
 -            if (arg_is_const(op->args[2])) {
 -                z_mask = ~arg_info(op->args[2])->z_mask;
 -                goto and_const;
 -            }
 -            /* But we certainly know nothing outside args[1] may be set. */
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        case INDEX_op_sar_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_sar_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_shr_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_shr_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_extrl_i64_i32:
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -        case INDEX_op_extrh_i64_i32:
 -            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
 -            break;
 -
 -        CASE_OP_32_64(shl):
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
 -                z_mask = arg_info(op->args[1])->z_mask << tmp;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(neg):
 -            /* Set to 1 all bits to the left of the rightmost.  */
 -            z_mask = -(arg_info(op->args[1])->z_mask
 -                       & -arg_info(op->args[1])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(deposit):
 -            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 -                               op->args[3], op->args[4],
 -                               arg_info(op->args[2])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(extract):
 -            z_mask = extract64(arg_info(op->args[1])->z_mask,
 -                               op->args[2], op->args[3]);
 -            if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -        CASE_OP_32_64(sextract):
 -            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 -                                op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(or):
 -        CASE_OP_32_64(xor):
 -            z_mask = arg_info(op->args[1])->z_mask
 -                   | arg_info(op->args[2])->z_mask;
 -            break;
 -
 -        case INDEX_op_clz_i32:
 -        case INDEX_op_ctz_i32:
 -            z_mask = arg_info(op->args[2])->z_mask | 31;
 -            break;
 -
 -        case INDEX_op_clz_i64:
 -        case INDEX_op_ctz_i64:
 -            z_mask = arg_info(op->args[2])->z_mask | 63;
 -            break;
 -
 -        case INDEX_op_ctpop_i32:
 -            z_mask = 32 | 31;
 -            break;
 -        case INDEX_op_ctpop_i64:
 -            z_mask = 64 | 63;
 -            break;
 -
 -        CASE_OP_32_64(setcond):
 -        case INDEX_op_setcond2_i32:
 -            z_mask = 1;
 -            break;
 -
 -        CASE_OP_32_64(movcond):
 -            z_mask = arg_info(op->args[3])->z_mask
 -                   | arg_info(op->args[4])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(ld8u):
 -            z_mask = 0xff;
 -            break;
 -        CASE_OP_32_64(ld16u):
 -            z_mask = 0xffff;
 -            break;
 -        case INDEX_op_ld32u_i64:
 -            z_mask = 0xffffffffu;
 -            break;
 -
 -        CASE_OP_32_64(qemu_ld):
 -            {
 -                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 -                MemOp mop = get_memop(oi);
 -                if (!(mop & MO_SIGN)) {
 -                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 -                }
 -            }
 -            break;
 -
 -        CASE_OP_32_64(bswap16):
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffff) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap16(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int16_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(16, 48);
 -                break;
 -            }
 -            break;
 -
 -        case INDEX_op_bswap32_i64:
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffffffffu) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap32(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int32_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(32, 32);
 -                break;
 -            }
 -            break;
 -
 -        default:
 -            break;
 -        }
--        g_assert_not_reached();
+-
--    }
+-        /* 32-bit ops generate 32-bit results.  For the result is zero test
--
+-           below, we can ignore high bits, but for further optimizations we
-     switch (is_jmp) {
+-           need to record that the high bits contain garbage.  */
-     case DISAS_TOO_MANY:
+-        partmask = z_mask;
-         gen_goto_tb(dc, 0, npc);
+-        if (ctx.type == TCG_TYPE_I32) {
 -            z_mask |= ~(tcg_target_ulong)0xffffffffu;
 -            partmask &= 0xffffffffu;
 -            affected &= 0xffffffffu;
 -        }
 -        ctx.z_mask = z_mask;
 -
 -        if (partmask == 0) {
 -            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -            continue;
 -        }
 -        if (affected == 0) {
 -            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            continue;
 -        }
 +        /* Assume all bits affected, and no bits known zero. */
 +        ctx.a_mask = -1;
 +        ctx.z_mask = -1;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32u_i64:
 +            done = fold_tcg_ld(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 --
 .25.1

-[PULL 21/24] target/sh4: Drop check for singlestep_enabled
+[PULL 44/56] tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
-GDB single-stepping is now handled generically.
+Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
 and muls2_i64.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/sh4/helper.h    |  1 -
+ tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
- target/sh4/op_helper.c |  5 -----
+file changed, 35 insertions(+), 9 deletions(-)
  target/sh4/translate.c | 14 +++-----------
 files changed, 3 insertions(+), 17 deletions(-)
-diff --git a/target/sh4/helper.h b/target/sh4/helper.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/sh4/helper.h
+--- a/tcg/optimize.c
-+++ b/target/sh4/helper.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ DEF_HELPER_1(raise_illegal_instruction, noreturn, env)
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
- DEF_HELPER_1(raise_slot_illegal_instruction, noreturn, env)
+     return false;
  DEF_HELPER_1(raise_fpu_disable, noreturn, env)
  DEF_HELPER_1(raise_slot_fpu_disable, noreturn, env)
 -DEF_HELPER_1(debug, noreturn, env)
  DEF_HELPER_1(sleep, noreturn, env)
  DEF_HELPER_2(trapa, noreturn, env, i32)
  DEF_HELPER_1(exclusive, noreturn, env)
 diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/sh4/op_helper.c
 +++ b/target/sh4/op_helper.c
@@ -XXX,XX +XXX,XX @@ void helper_raise_slot_fpu_disable(CPUSH4State *env)
      raise_exception(env, 0x820, 0);
  }
--void helper_debug(CPUSH4State *env)
+-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
--{
++static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 -    raise_exception(env, EXCP_DEBUG, 0);
 -}
 -
  void helper_sleep(CPUSH4State *env)
  {
-     CPUState *cs = env_cpu(env);
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-diff --git a/target/sh4/translate.c b/target/sh4/translate.c
+-        uint32_t a = arg_info(op->args[2])->val;
-index XXXXXXX..XXXXXXX 100644
+-        uint32_t b = arg_info(op->args[3])->val;
---- a/target/sh4/translate.c
+-        uint64_t r = (uint64_t)a * b;
-+++ b/target/sh4/translate.c
++        uint64_t a = arg_info(op->args[2])->val;
-@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
++        uint64_t b = arg_info(op->args[3])->val;
-         tcg_gen_exit_tb(ctx->base.tb, n);
++        uint64_t h, l;
-     } else {
+         TCGArg rl, rh;
-         tcg_gen_movi_i32(cpu_pc, dest);
+-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
--        if (ctx->base.singlestep_enabled) {
++        TCGOp *op2;
--            gen_helper_debug(cpu_env);
++
--        } else if (use_exit_tb(ctx)) {
++        switch (op->opc) {
-+        if (use_exit_tb(ctx)) {
++        case INDEX_op_mulu2_i32:
-             tcg_gen_exit_tb(NULL, 0);
++            l = (uint64_t)(uint32_t)a * (uint32_t)b;
-         } else {
++            h = (int32_t)(l >> 32);
-             tcg_gen_lookup_and_goto_ptr();
++            l = (int32_t)l;
-@@ -XXX,XX +XXX,XX @@ static void gen_jump(DisasContext * ctx)
++            break;
-        delayed jump as immediate jump are conditinal jumps */
++        case INDEX_op_muls2_i32:
-     tcg_gen_mov_i32(cpu_pc, cpu_delayed_pc);
++            l = (int64_t)(int32_t)a * (int32_t)b;
-         tcg_gen_discard_i32(cpu_delayed_pc);
++            h = l >> 32;
--        if (ctx->base.singlestep_enabled) {
++            l = (int32_t)l;
--            gen_helper_debug(cpu_env);
++            break;
--        } else if (use_exit_tb(ctx)) {
++        case INDEX_op_mulu2_i64:
-+        if (use_exit_tb(ctx)) {
++            mulu64(&l, &h, a, b);
-             tcg_gen_exit_tb(NULL, 0);
++            break;
-         } else {
++        case INDEX_op_muls2_i64:
-             tcg_gen_lookup_and_goto_ptr();
++            muls64(&l, &h, a, b);
-@@ -XXX,XX +XXX,XX @@ static void sh4_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
++            break;
-     switch (ctx->base.is_jmp) {
++        default:
-     case DISAS_STOP:
++            g_assert_not_reached();
-         gen_save_cpu_state(ctx, true);
++        }
--        if (ctx->base.singlestep_enabled) {
--            gen_helper_debug(cpu_env);
+         rl = op->args[0];
--        } else {
+         rh = op->args[1];
--            tcg_gen_exit_tb(NULL, 0);
+-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
--        }
+-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
-+        tcg_gen_exit_tb(NULL, 0);
++
-         break;
++        /* The proper opcode is supplied by tcg_opt_gen_mov. */
-     case DISAS_NEXT:
++        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
-     case DISAS_TOO_MANY:
++
 +        tcg_opt_gen_movi(ctx, op, rl, l);
 +        tcg_opt_gen_movi(ctx, op2, rh, h);
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(muluh):
              done = fold_mul_highpart(&ctx, op);
              break;
 -        case INDEX_op_mulu2_i32:
 -            done = fold_mulu2_i32(&ctx, op);
 +        CASE_OP_32_64(muls2):
 +        CASE_OP_32_64(mulu2):
 +            done = fold_multiply2(&ctx, op);
              break;
          CASE_OP_32_64(nand):
              done = fold_nand(&ctx, op);
 --
 .25.1

-[PULL 22/24] target/tricore: Drop check for singlestep_enabled
+[PULL 45/56] tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
-GDB single-stepping is now handled generically.
+Rename to fold_addsub2.
 Use Int128 to implement the wider operation.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/tricore/helper.h    |  1 -
+ tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
- target/tricore/op_helper.c |  7 -------
+file changed, 44 insertions(+), 21 deletions(-)
  target/tricore/translate.c | 14 +-------------
 files changed, 1 insertion(+), 21 deletions(-)
-diff --git a/target/tricore/helper.h b/target/tricore/helper.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/tricore/helper.h
+--- a/tcg/optimize.c
-+++ b/target/tricore/helper.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ DEF_HELPER_2(psw_write, void, env, i32)
+@@ -XXX,XX +XXX,XX @@
- DEF_HELPER_1(psw_read, i32, env)
+  */
- /* Exceptions */
- DEF_HELPER_3(raise_exception_sync, noreturn, env, i32, i32)
+ #include "qemu/osdep.h"
--DEF_HELPER_2(qemu_excp, noreturn, env, i32)
++#include "qemu/int128.h"
-diff --git a/target/tricore/op_helper.c b/target/tricore/op_helper.c
+ #include "tcg/tcg-op.h"
-index XXXXXXX..XXXXXXX 100644
+ #include "tcg-internal.h"
---- a/target/tricore/op_helper.c
-+++ b/target/tricore/op_helper.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void raise_exception_sync_helper(CPUTriCoreState *env, uint32_t class,
+     return false;
      raise_exception_sync_internal(env, class, tin, pc, 0);
  }
--void helper_qemu_excp(CPUTriCoreState *env, uint32_t excp)
+-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
--{
++static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
--    CPUState *cs = env_cpu(env);
+ {
--    cs->exception_index = excp;
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
--    cpu_loop_exit(cs);
+         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
--}
+-        uint32_t al = arg_info(op->args[2])->val;
--
+-        uint32_t ah = arg_info(op->args[3])->val;
- /* Addressing mode helper */
+-        uint32_t bl = arg_info(op->args[4])->val;
+-        uint32_t bh = arg_info(op->args[5])->val;
- static uint16_t reverse16(uint16_t val)
+-        uint64_t a = ((uint64_t)ah << 32) | al;
-diff --git a/target/tricore/translate.c b/target/tricore/translate.c
+-        uint64_t b = ((uint64_t)bh << 32) | bl;
-index XXXXXXX..XXXXXXX 100644
++        uint64_t al = arg_info(op->args[2])->val;
---- a/target/tricore/translate.c
++        uint64_t ah = arg_info(op->args[3])->val;
-+++ b/target/tricore/translate.c
++        uint64_t bl = arg_info(op->args[4])->val;
-@@ -XXX,XX +XXX,XX @@ static inline void gen_save_pc(target_ulong pc)
++        uint64_t bh = arg_info(op->args[5])->val;
-     tcg_gen_movi_tl(cpu_PC, pc);
+         TCGArg rl, rh;
 -        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 +        TCGOp *op2;
 -        if (add) {
 -            a += b;
 +        if (ctx->type == TCG_TYPE_I32) {
 +            uint64_t a = deposit64(al, 32, 32, ah);
 +            uint64_t b = deposit64(bl, 32, 32, bh);
 +
 +            if (add) {
 +                a += b;
 +            } else {
 +                a -= b;
 +            }
 +
 +            al = sextract64(a, 0, 32);
 +            ah = sextract64(a, 32, 32);
          } else {
 -            a -= b;
 +            Int128 a = int128_make128(al, ah);
 +            Int128 b = int128_make128(bl, bh);
 +
 +            if (add) {
 +                a = int128_add(a, b);
 +            } else {
 +                a = int128_sub(a, b);
 +            }
 +
 +            al = int128_getlo(a);
 +            ah = int128_gethi(a);
          }
          rl = op->args[0];
          rh = op->args[1];
 -        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
 -        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
 +
 +        /* The proper opcode is supplied by tcg_opt_gen_mov. */
 +        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
 +
 +        tcg_opt_gen_movi(ctx, op, rl, al);
 +        tcg_opt_gen_movi(ctx, op2, rh, ah);
          return true;
      }
      return false;
  }
--static void generate_qemu_excp(DisasContext *ctx, int excp)
+-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
--{
++static bool fold_add2(OptContext *ctx, TCGOp *op)
 -    TCGv_i32 tmp = tcg_const_i32(excp);
 -    gen_helper_qemu_excp(cpu_env, tmp);
 -    ctx->base.is_jmp = DISAS_NORETURN;
 -    tcg_temp_free(tmp);
 -}
 -
  static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
  {
-     if (translator_use_goto_tb(&ctx->base, dest)) {
+-    return fold_addsub2_i32(ctx, op, true);
-@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
++    return fold_addsub2(ctx, op, true);
          tcg_gen_exit_tb(ctx->base.tb, n);
      } else {
          gen_save_pc(dest);
 -        if (ctx->base.singlestep_enabled) {
 -            generate_qemu_excp(ctx, EXCP_DEBUG);
 -        } else {
 -            tcg_gen_lookup_and_goto_ptr();
 -        }
 +        tcg_gen_lookup_and_goto_ptr();
      }
  }
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
+     return false;
+ }
+-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
++static bool fold_sub2(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_addsub2_i32(ctx, op, false);
++    return fold_addsub2(ctx, op, false);
+ }
+ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(add):
+             done = fold_add(&ctx, op);
+             break;
+-        case INDEX_op_add2_i32:
+-            done = fold_add2_i32(&ctx, op);
++        CASE_OP_32_64(add2):
++            done = fold_add2(&ctx, op);
+             break;
+         CASE_OP_32_64_VEC(and):
+             done = fold_and(&ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+-        case INDEX_op_sub2_i32:
+-            done = fold_sub2_i32(&ctx, op);
++        CASE_OP_32_64(sub2):
++            done = fold_sub2(&ctx, op);
+             break;
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
 --
 .25.1

-[PULL 09/24] target/i386: Drop check for singlestep_enabled
+[PULL 46/56] tcg/optimize: Sink commutative operand swapping into fold functions
-GDB single-stepping is now handled generically.
+Most of these are handled by creating a fold_const2_commutative
+to handle all of the binary operators.  The rest were already
 handled on a case-by-case basis in the switch, and have their
 own fold function in which to place the call.
 We now have only one major switch on TCGOpcode.
 Introduce NO_DEST and a block comment for swap_commutative in
 order to make the handling of brcond and movcond opcodes cleaner.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/i386/helper.h          | 1 -
+ tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
- target/i386/tcg/misc_helper.c | 8 --------
+file changed, 70 insertions(+), 72 deletions(-)
- target/i386/tcg/translate.c   | 4 +---
-files changed, 1 insertion(+), 12 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 diff --git a/target/i386/helper.h b/target/i386/helper.h
 index XXXXXXX..XXXXXXX 100644
---- a/target/i386/helper.h
+--- a/tcg/optimize.c
-+++ b/target/i386/helper.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ DEF_HELPER_2(syscall, void, env, int)
+@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
- DEF_HELPER_2(sysret, void, env, int)
+     return -1;
  #endif
  DEF_HELPER_FLAGS_2(pause, TCG_CALL_NO_WG, noreturn, env, int)
 -DEF_HELPER_FLAGS_1(debug, TCG_CALL_NO_WG, noreturn, env)
  DEF_HELPER_1(reset_rf, void, env)
  DEF_HELPER_FLAGS_3(raise_interrupt, TCG_CALL_NO_WG, noreturn, env, int, int)
  DEF_HELPER_FLAGS_2(raise_exception, TCG_CALL_NO_WG, noreturn, env, int)
 diff --git a/target/i386/tcg/misc_helper.c b/target/i386/tcg/misc_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/misc_helper.c
 +++ b/target/i386/tcg/misc_helper.c
@@ -XXX,XX +XXX,XX @@ void QEMU_NORETURN helper_pause(CPUX86State *env, int next_eip_addend)
      do_pause(env);
  }
--void QEMU_NORETURN helper_debug(CPUX86State *env)
++/**
--{
++ * swap_commutative:
--    CPUState *cs = env_cpu(env);
++ * @dest: TCGArg of the destination argument, or NO_DEST.
 + * @p1: first paired argument
 + * @p2: second paired argument
 + *
 + * If *@p1 is a constant and *@p2 is not, swap.
 + * If *@p2 matches @dest, swap.
 + * Return true if a swap was performed.
 + */
 +
 +#define NO_DEST  temp_arg(NULL)
 +
  static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
  {
      TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 +{
 +    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
  static bool fold_add(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
  static bool fold_add2(OptContext *ctx, TCGOp *op)
  {
 +    /* Note that the high and low parts may be independently swapped. */
 +    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 +    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 +
      return fold_addsub2(ctx, op, true);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      uint64_t z1, z2;
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
 +        op->args[2] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
  static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[4];
 -    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      TCGArg label = op->args[5];
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[0], &op->args[2])) {
 +        op->args[4] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      if (i >= 0) {
          goto do_brcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +    /*
 +     * Canonicalize the "false" input reg to match the destination reg so
 +     * that the tcg backend can implement a "move if true" operation.
 +     */
 +    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 +        op->args[5] = cond = tcg_invert_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  {
 +    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 +
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
          uint64_t a = arg_info(op->args[2])->val;
          uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 +        op->args[3] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
  static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[1], &op->args[3])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
      if (i >= 0) {
          goto do_setcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* For commutative operations make constant second argument */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(add):
 -        CASE_OP_32_64_VEC(mul):
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(eqv):
 -        CASE_OP_32_64(nand):
 -        CASE_OP_32_64(nor):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 -            break;
 -        CASE_OP_32_64(brcond):
 -            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
 -                op->args[2] = tcg_swap_cond(op->args[2]);
 -            }
 -            break;
 -        CASE_OP_32_64(setcond):
 -            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 -                op->args[3] = tcg_swap_cond(op->args[3]);
 -            }
 -            break;
 -        CASE_OP_32_64(movcond):
 -            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            /* For movcond, we canonicalize the "false" input reg to match
 -               the destination reg so that the tcg backend can implement
 -               a "move if true" operation.  */
 -            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 -                op->args[5] = tcg_invert_cond(op->args[5]);
 -            }
 -            break;
 -        CASE_OP_32_64(add2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 -            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 -            break;
 -        CASE_OP_32_64(mulu2):
 -        CASE_OP_32_64(muls2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 -            break;
 -        case INDEX_op_brcond2_i32:
 -            if (swap_commutative2(&op->args[0], &op->args[2])) {
 -                op->args[4] = tcg_swap_cond(op->args[4]);
 -            }
 -            break;
 -        case INDEX_op_setcond2_i32:
 -            if (swap_commutative2(&op->args[1], &op->args[3])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
--    cs->exception_index = EXCP_DEBUG;
+         /* Assume all bits affected, and no bits known zero. */
--    cpu_loop_exit(cs);
+         ctx.a_mask = -1;
--}
+         ctx.z_mask = -1;
 -
  uint64_t helper_rdpkru(CPUX86State *env, uint32_t ecx)
  {
      if ((env->cr[4] & CR4_PKE_MASK) == 0) {
 diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/i386/tcg/translate.c
 +++ b/target/i386/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ do_gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf, bool jr)
      if (s->base.tb->flags & HF_RF_MASK) {
          gen_helper_reset_rf(cpu_env);
      }
 -    if (s->base.singlestep_enabled) {
 -        gen_helper_debug(cpu_env);
 -    } else if (recheck_tf) {
 +    if (recheck_tf) {
          gen_helper_rechecking_single_step(cpu_env);
          tcg_gen_exit_tb(NULL, 0);
      } else if (s->flags & HF_TF_MASK) {
 --
 .25.1

-New patch
+[PULL 47/56] tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
+This "garbage" setting pre-dates the addition of the type
+changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
+and INDEX_op_extr{l,h}_i64_i32.
+So now we have a definitive points at which to adjust z_mask
+to eliminate such bits from the 32-bit operands.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 35 ++++++++++++++++-------------------
+file changed, 16 insertions(+), 19 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
+         ti->is_const = true;
+         ti->val = ts->val;
+         ti->z_mask = ts->val;
+-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
+-            /* High bits of a 32-bit quantity are garbage.  */
+-            ti->z_mask |= ~0xffffffffull;
+-        }
+     } else {
+         ti->is_const = false;
+         ti->z_mask = -1;
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+     TCGTemp *src_ts = arg_temp(src);
+     TempOptInfo *di;
+     TempOptInfo *si;
+-    uint64_t z_mask;
+     TCGOpcode new_op;
+     if (ts_are_copies(dst_ts, src_ts)) {
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+     op->args[0] = dst;
+     op->args[1] = src;
+-    z_mask = si->z_mask;
+-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
+-        /* High bits of the destination are now garbage.  */
+-        z_mask |= ~0xffffffffull;
+-    }
+-    di->z_mask = z_mask;
++    di->z_mask = si->z_mask;
+     if (src_ts->type == dst_ts->type) {
+         TempOptInfo *ni = ts_info(si->next_copy);
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+ static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+                              TCGArg dst, uint64_t val)
+ {
+-    /* Convert movi to mov with constant temp. */
+-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
++    TCGTemp *tv;
++    if (ctx->type == TCG_TYPE_I32) {
++        val = (int32_t)val;
++    }
++
++    /* Convert movi to mov with constant temp. */
++    tv = tcg_constant_internal(ctx->type, val);
+     init_ts_info(ctx, tv);
+     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
+ }
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
+     uint64_t z_mask = ctx->z_mask;
+     /*
+-     * 32-bit ops generate 32-bit results.  For the result is zero test
+-     * below, we can ignore high bits, but for further optimizations we
+-     * need to record that the high bits contain garbage.
++     * 32-bit ops generate 32-bit results, which for the purpose of
++     * simplifying tcg are sign-extended.  Certainly that's how we
++     * represent our constants elsewhere.  Note that the bits will
++     * be reset properly for a 64-bit value when encountering the
++     * type changing opcodes.
+      */
+     if (ctx->type == TCG_TYPE_I32) {
+-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
+-        a_mask &= MAKE_64BIT_MASK(0, 32);
+-        z_mask &= MAKE_64BIT_MASK(0, 32);
++        a_mask = (int32_t)a_mask;
++        z_mask = (int32_t)z_mask;
++        ctx->z_mask = z_mask;
+     }
+     if (z_mask == 0) {
+--
+.25.1

-[PULL 12/24] target/microblaze: Drop checks for singlestep_enabled
+[PULL 48/56] tcg/optimize: Use fold_xx_to_i for orc
-GDB single-stepping is now handled generically.
+Recognize the constant function for or-complement.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/microblaze/translate.c | 14 ++------------
+ tcg/optimize.c | 1 +
-file changed, 2 insertions(+), 12 deletions(-)
+file changed, 1 insertion(+)
-diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/microblaze/translate.c
+--- a/tcg/optimize.c
-+++ b/target/microblaze/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void gen_raise_hw_excp(DisasContext *dc, uint32_t esr_ec)
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
  static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
  {
--    if (dc->base.singlestep_enabled) {
+     if (fold_const2(ctx, op) ||
--        TCGv_i32 tmp = tcg_const_i32(EXCP_DEBUG);
++        fold_xx_to_i(ctx, op, -1) ||
--        tcg_gen_movi_i32(cpu_pc, dest);
+         fold_xi_to_x(ctx, op, -1) ||
--        gen_helper_raise_exception(cpu_env, tmp);
+         fold_ix_to_not(ctx, op, 0)) {
--        tcg_temp_free_i32(tmp);
+         return true;
 -    } else if (translator_use_goto_tb(&dc->base, dest)) {
 +    if (translator_use_goto_tb(&dc->base, dest)) {
          tcg_gen_goto_tb(n);
          tcg_gen_movi_i32(cpu_pc, dest);
          tcg_gen_exit_tb(dc->base.tb, n);
@@ -XXX,XX +XXX,XX @@ static void mb_tr_tb_stop(DisasContextBase *dcb, CPUState *cs)
          /* Indirect jump (or direct jump w/ goto_tb disabled) */
          tcg_gen_mov_i32(cpu_pc, cpu_btarget);
          tcg_gen_discard_i32(cpu_btarget);
 -
 -        if (unlikely(cs->singlestep_enabled)) {
 -            gen_raise_exception(dc, EXCP_DEBUG);
 -        } else {
 -            tcg_gen_lookup_and_goto_ptr();
 -        }
 +        tcg_gen_lookup_and_goto_ptr();
          return;
      default:
 --
 .25.1

-[PULL 08/24] target/i386: Check CF_NO_GOTO_TB for dc->jmp_opt
+[PULL 49/56] tcg/optimize: Use fold_xi_to_x for mul
-We were using singlestep_enabled as a proxy for whether
+Recognize the identity function for low-part multiply.
 translator_use_goto_tb would always return false.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/i386/tcg/translate.c | 5 +++--
+ tcg/optimize.c | 3 ++-
-file changed, 3 insertions(+), 2 deletions(-)
+file changed, 2 insertions(+), 1 deletion(-)
-diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/i386/tcg/translate.c
+--- a/tcg/optimize.c
-+++ b/target/i386/tcg/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
-     DisasContext *dc = container_of(dcbase, DisasContext, base);
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
-     CPUX86State *env = cpu->env_ptr;
+ {
-     uint32_t flags = dc->base.tb->flags;
+     if (fold_const2(ctx, op) ||
-+    uint32_t cflags = tb_cflags(dc->base.tb);
+-        fold_xi_to_i(ctx, op, 0)) {
-     int cpl = (flags >> HF_CPL_SHIFT) & 3;
++        fold_xi_to_i(ctx, op, 0) ||
-     int iopl = (flags >> IOPL_SHIFT) & 3;
++        fold_xi_to_x(ctx, op, 1)) {
+         return true;
-@@ -XXX,XX +XXX,XX @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
+     }
-     dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
+     return false;
      dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
      dc->cpuid_xsave_features = env->features[FEAT_XSAVE];
 -    dc->jmp_opt = !(dc->base.singlestep_enabled ||
 +    dc->jmp_opt = !((cflags & CF_NO_GOTO_TB) ||
                      (flags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK)));
      /*
       * If jmp_opt, we want to handle each string instruction individually.
       * For icount also disable repz optimization so that each iteration
       * is accounted separately.
       */
 -    dc->repz_opt = !dc->jmp_opt && !(tb_cflags(dc->base.tb) & CF_USE_ICOUNT);
 +    dc->repz_opt = !dc->jmp_opt && !(cflags & CF_USE_ICOUNT);
      dc->T0 = tcg_temp_new();
      dc->T1 = tcg_temp_new();
 --
 .25.1

-[PULL 19/24] target/rx: Drop checks for singlestep_enabled
+[PULL 50/56] tcg/optimize: Use fold_xi_to_x for div
-GDB single-stepping is now handled generically.
+Recognize the identity function for division.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/rx/helper.h    |  1 -
+ tcg/optimize.c | 6 +++++-
- target/rx/op_helper.c |  8 --------
+file changed, 5 insertions(+), 1 deletion(-)
  target/rx/translate.c | 12 ++----------
 files changed, 2 insertions(+), 19 deletions(-)
-diff --git a/target/rx/helper.h b/target/rx/helper.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/rx/helper.h
+--- a/tcg/optimize.c
-+++ b/target/rx/helper.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ DEF_HELPER_1(raise_illegal_instruction, noreturn, env)
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
- DEF_HELPER_1(raise_access_fault, noreturn, env)
- DEF_HELPER_1(raise_privilege_violation, noreturn, env)
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
- DEF_HELPER_1(wait, noreturn, env)
+ {
--DEF_HELPER_1(debug, noreturn, env)
+-    return fold_const2(ctx, op);
- DEF_HELPER_2(rxint, noreturn, env, i32)
++    if (fold_const2(ctx, op) ||
- DEF_HELPER_1(rxbrk, noreturn, env)
++        fold_xi_to_x(ctx, op, 1)) {
- DEF_HELPER_FLAGS_3(fadd, TCG_CALL_NO_WG, f32, env, f32, f32)
++        return true;
-diff --git a/target/rx/op_helper.c b/target/rx/op_helper.c
++    }
-index XXXXXXX..XXXXXXX 100644
++    return false;
 --- a/target/rx/op_helper.c
 +++ b/target/rx/op_helper.c
@@ -XXX,XX +XXX,XX @@ void QEMU_NORETURN helper_wait(CPURXState *env)
      raise_exception(env, EXCP_HLT, 0);
  }
--void QEMU_NORETURN helper_debug(CPURXState *env)
+ static bool fold_dup(OptContext *ctx, TCGOp *op)
 -{
 -    CPUState *cs = env_cpu(env);
 -
 -    cs->exception_index = EXCP_DEBUG;
 -    cpu_loop_exit(cs);
 -}
 -
  void QEMU_NORETURN helper_rxint(CPURXState *env, uint32_t vec)
  {
      raise_exception(env, 0x100 + vec, 0);
 diff --git a/target/rx/translate.c b/target/rx/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/rx/translate.c
 +++ b/target/rx/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
          tcg_gen_exit_tb(dc->base.tb, n);
      } else {
          tcg_gen_movi_i32(cpu_pc, dest);
 -        if (dc->base.singlestep_enabled) {
 -            gen_helper_debug(cpu_env);
 -        } else {
 -            tcg_gen_lookup_and_goto_ptr();
 -        }
 +        tcg_gen_lookup_and_goto_ptr();
      }
      dc->base.is_jmp = DISAS_NORETURN;
  }
@@ -XXX,XX +XXX,XX @@ static void rx_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
          gen_goto_tb(ctx, 0, dcbase->pc_next);
          break;
      case DISAS_JUMP:
 -        if (ctx->base.singlestep_enabled) {
 -            gen_helper_debug(cpu_env);
 -        } else {
 -            tcg_gen_lookup_and_goto_ptr();
 -        }
 +        tcg_gen_lookup_and_goto_ptr();
          break;
      case DISAS_UPDATE:
          tcg_gen_movi_i32(cpu_pc, ctx->base.pc_next);
 --
 .25.1

-[PULL 05/24] target/hexagon: Drop checks for singlestep_enabled
+[PULL 51/56] tcg/optimize: Use fold_xx_to_i for rem
-GDB single-stepping is now handled generically.
+Recognize the constant function for remainder.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/hexagon/translate.c | 12 ++----------
+ tcg/optimize.c | 6 +++++-
-file changed, 2 insertions(+), 10 deletions(-)
+file changed, 5 insertions(+), 1 deletion(-)
-diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/hexagon/translate.c
+--- a/tcg/optimize.c
-+++ b/target/hexagon/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void gen_end_tb(DisasContext *ctx)
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
  static bool fold_remainder(OptContext *ctx, TCGOp *op)
  {
-     gen_exec_counters(ctx);
+-    return fold_const2(ctx, op);
-     tcg_gen_mov_tl(hex_gpr[HEX_REG_PC], hex_next_PC);
++    if (fold_const2(ctx, op) ||
--    if (ctx->base.singlestep_enabled) {
++        fold_xx_to_i(ctx, op, 0)) {
--        gen_exception_raw(EXCP_DEBUG);
++        return true;
--    } else {
++    }
--        tcg_gen_exit_tb(NULL, 0);
++    return false;
 -    }
 +    tcg_gen_exit_tb(NULL, 0);
      ctx->base.is_jmp = DISAS_NORETURN;
  }
-@@ -XXX,XX +XXX,XX @@ static void hexagon_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
+ static bool fold_setcond(OptContext *ctx, TCGOp *op)
      case DISAS_TOO_MANY:
          gen_exec_counters(ctx);
          tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], ctx->base.pc_next);
 -        if (ctx->base.singlestep_enabled) {
 -            gen_exception_raw(EXCP_DEBUG);
 -        } else {
 -            tcg_gen_exit_tb(NULL, 0);
 -        }
 +        tcg_gen_exit_tb(NULL, 0);
          break;
      case DISAS_NORETURN:
          break;
 --
 .25.1

-[PULL 23/24] target/xtensa: Drop check for singlestep_enabled
+[PULL 52/56] tcg/optimize: Optimize sign extensions
-GDB single-stepping is now handled generically.
+Certain targets, like riscv, produce signed 32-bit results.
+This can lead to lots of redundant extensions as values are
 manipulated.
 Begin by tracking only the obvious sign-extensions, and
 converting them to simple copies when possible.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/xtensa/translate.c | 25 ++++++++-----------------
+ tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
-file changed, 8 insertions(+), 17 deletions(-)
+file changed, 102 insertions(+), 21 deletions(-)
-diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/xtensa/translate.c
+--- a/tcg/optimize.c
-+++ b/target/xtensa/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static void gen_jump_slot(DisasContext *dc, TCGv dest, int slot)
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-     if (dc->icount) {
+     TCGTemp *next_copy;
-         tcg_gen_mov_i32(cpu_SR[ICOUNT], dc->next_icount);
+     uint64_t val;
-     }
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
--    if (dc->base.singlestep_enabled) {
++    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
--        gen_exception(dc, EXCP_DEBUG);
+ } TempOptInfo;
-+    if (dc->op_flags & XTENSA_OP_POSTPROCESS) {
-+        slot = gen_postprocess(dc, slot);
+ typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      /* In flight values from optimization. */
      uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
      uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
 +    uint64_t s_mask;  /* mask of clrsb(value) bits */
      TCGType type;
  } OptContext;
 +/* Calculate the smask for a specific value. */
 +static uint64_t smask_from_value(uint64_t value)
 +{
 +    int rep = clrsb64(value);
 +    return ~(~0ull >> rep);
 +}
 +
 +/*
 + * Calculate the smask for a given set of known-zeros.
 + * If there are lots of zeros on the left, we can consider the remainder
 + * an unsigned field, and thus the corresponding signed field is one bit
 + * larger.
 + */
 +static uint64_t smask_from_zmask(uint64_t zmask)
 +{
 +    /*
 +     * Only the 0 bits are significant for zmask, thus the msb itself
 +     * must be zero, else we have no sign information.
 +     */
 +    int rep = clz64(zmask);
 +    if (rep == 0) {
 +        return 0;
 +    }
-+    if (slot >= 0) {
++    rep -= 1;
-+        tcg_gen_goto_tb(slot);
++    return ~(~0ull >> rep);
-+        tcg_gen_exit_tb(dc->base.tb, slot);
++}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
      ti->prev_copy = ts;
      ti->is_const = false;
      ti->z_mask = -1;
 +    ti->s_mask = 0;
  }
  static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
          ti->is_const = true;
          ti->val = ts->val;
          ti->z_mask = ts->val;
 +        ti->s_mask = smask_from_value(ts->val);
      } else {
--        if (dc->op_flags & XTENSA_OP_POSTPROCESS) {
+         ti->is_const = false;
--            slot = gen_postprocess(dc, slot);
+         ti->z_mask = -1;
--        }
++        ti->s_mask = 0;
--        if (slot >= 0) {
+     }
--            tcg_gen_goto_tb(slot);
+ }
--            tcg_gen_exit_tb(dc->base.tb, slot);
--        } else {
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
--            tcg_gen_exit_tb(NULL, 0);
+     op->args[1] = src;
--        }
-+        tcg_gen_exit_tb(NULL, 0);
+     di->z_mask = si->z_mask;
-     }
++    di->s_mask = si->s_mask;
-     dc->base.is_jmp = DISAS_NORETURN;
- }
+     if (src_ts->type == dst_ts->type) {
-@@ -XXX,XX +XXX,XX @@ static void xtensa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
+         TempOptInfo *ni = ts_info(si->next_copy);
-     case DISAS_NORETURN:
+@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
-         break;
-     case DISAS_TOO_MANY:
+     nb_oargs = def->nb_oargs;
--        if (dc->base.singlestep_enabled) {
+     for (i = 0; i < nb_oargs; i++) {
--            tcg_gen_movi_i32(cpu_pc, dc->pc);
+-        reset_temp(op->args[i]);
--            gen_exception(dc, EXCP_DEBUG);
++        TCGTemp *ts = arg_temp(op->args[i]);
--        } else {
++        reset_ts(ts);
--            gen_jumpi(dc, dc->pc, 0);
+         /*
--        }
+-         * Save the corresponding known-zero bits mask for the
-+        gen_jumpi(dc, dc->pc, 0);
++         * Save the corresponding known-zero/sign bits mask for the
-         break;
+          * first output argument (only one supported so far).
           */
          if (i == 0) {
 -            arg_info(op->args[i])->z_mask = ctx->z_mask;
 +            ts_info(ts)->z_mask = ctx->z_mask;
 +            ts_info(ts)->s_mask = ctx->s_mask;
          }
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
      uint64_t z_mask = ctx->z_mask;
 +    uint64_t s_mask = ctx->s_mask;
      /*
       * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      if (ctx->type == TCG_TYPE_I32) {
          a_mask = (int32_t)a_mask;
          z_mask = (int32_t)z_mask;
 +        s_mask |= MAKE_64BIT_MASK(32, 32);
          ctx->z_mask = z_mask;
 +        ctx->s_mask = s_mask;
      }
      if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask, sign;
 +    uint64_t z_mask, s_mask, sign;
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      }
      z_mask = arg_info(op->args[1])->z_mask;
 +
      switch (op->opc) {
      case INDEX_op_bswap16_i32:
      case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      default:
          g_assert_not_reached();
+     }
++    s_mask = smask_from_zmask(z_mask);
+     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+     case TCG_BSWAP_OZ:
+@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
+         /* If the sign bit may be 1, force all the bits above to 1. */
+         if (z_mask & sign) {
+             z_mask |= sign;
++            s_mask = sign << 1;
+         }
+         break;
+     default:
+         /* The high bits are undefined: force all bits above the sign to 1. */
+         z_mask |= sign << 1;
++        s_mask = 0;
+         break;
+     }
+     ctx->z_mask = z_mask;
++    ctx->s_mask = s_mask;
+     return fold_masks(ctx, op);
+ }
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+ static bool fold_extract(OptContext *ctx, TCGOp *op)
+ {
+     uint64_t z_mask_old, z_mask;
++    int pos = op->args[2];
++    int len = op->args[3];
+     if (arg_is_const(op->args[1])) {
+         uint64_t t;
+         t = arg_info(op->args[1])->val;
+-        t = extract64(t, op->args[2], op->args[3]);
++        t = extract64(t, pos, len);
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+     }
+     z_mask_old = arg_info(op->args[1])->z_mask;
+-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
+-    if (op->args[2] == 0) {
++    z_mask = extract64(z_mask_old, pos, len);
++    if (pos == 0) {
+         ctx->a_mask = z_mask_old ^ z_mask;
+     }
+     ctx->z_mask = z_mask;
++    ctx->s_mask = smask_from_zmask(z_mask);
+     return fold_masks(ctx, op);
+ }
+@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
+ static bool fold_exts(OptContext *ctx, TCGOp *op)
+ {
+-    uint64_t z_mask_old, z_mask, sign;
++    uint64_t s_mask_old, s_mask, z_mask, sign;
+     bool type_change = false;
+     if (fold_const1(ctx, op)) {
+         return true;
+     }
+-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
++    z_mask = arg_info(op->args[1])->z_mask;
++    s_mask = arg_info(op->args[1])->s_mask;
++    s_mask_old = s_mask;
+     switch (op->opc) {
+     CASE_OP_32_64(ext8s):
+@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
+     if (z_mask & sign) {
+         z_mask |= sign;
+-    } else if (!type_change) {
+-        ctx->a_mask = z_mask_old ^ z_mask;
+     }
++    s_mask |= sign << 1;
++
+     ctx->z_mask = z_mask;
++    ctx->s_mask = s_mask;
++    if (!type_change) {
++        ctx->a_mask = s_mask & ~s_mask_old;
++    }
+     return fold_masks(ctx, op);
+ }
+@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
+     }
+     ctx->z_mask = z_mask;
++    ctx->s_mask = smask_from_zmask(z_mask);
+     if (!type_change) {
+         ctx->a_mask = z_mask_old ^ z_mask;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+     MemOp mop = get_memop(oi);
+     int width = 8 * memop_size(mop);
+-    if (!(mop & MO_SIGN) && width < 64) {
+-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
++    if (width < 64) {
++        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
++        if (!(mop & MO_SIGN)) {
++            ctx->z_mask = MAKE_64BIT_MASK(0, width);
++            ctx->s_mask <<= 1;
++        }
+     }
+     /* Opcodes that touch guest memory stop the mb optimization.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+ static bool fold_sextract(OptContext *ctx, TCGOp *op)
+ {
+-    int64_t z_mask_old, z_mask;
++    uint64_t z_mask, s_mask, s_mask_old;
++    int pos = op->args[2];
++    int len = op->args[3];
+     if (arg_is_const(op->args[1])) {
+         uint64_t t;
+         t = arg_info(op->args[1])->val;
+-        t = sextract64(t, op->args[2], op->args[3]);
++        t = sextract64(t, pos, len);
+         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+     }
+-    z_mask_old = arg_info(op->args[1])->z_mask;
+-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
+-    if (op->args[2] == 0 && z_mask >= 0) {
+-        ctx->a_mask = z_mask_old ^ z_mask;
+-    }
++    z_mask = arg_info(op->args[1])->z_mask;
++    z_mask = sextract64(z_mask, pos, len);
+     ctx->z_mask = z_mask;
++    s_mask_old = arg_info(op->args[1])->s_mask;
++    s_mask = sextract64(s_mask_old, pos, len);
++    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
++    ctx->s_mask = s_mask;
++
++    if (pos == 0) {
++        ctx->a_mask = s_mask & ~s_mask_old;
++    }
++
+     return fold_masks(ctx, op);
+ }
+@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
+ {
+     /* We can't do any folding with a load, but we can record bits. */
+     switch (op->opc) {
++    CASE_OP_32_64(ld8s):
++        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
++        break;
+     CASE_OP_32_64(ld8u):
+         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
++        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
++        break;
++    CASE_OP_32_64(ld16s):
++        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
+         break;
+     CASE_OP_32_64(ld16u):
+         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
++        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
++        break;
++    case INDEX_op_ld32s_i64:
++        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
+         break;
+     case INDEX_op_ld32u_i64:
+         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
++        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
+         break;
+     default:
+         g_assert_not_reached();
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             ctx.type = TCG_TYPE_I32;
+         }
+-        /* Assume all bits affected, and no bits known zero. */
++        /* Assume all bits affected, no bits known zero, no sign reps. */
+         ctx.a_mask = -1;
+         ctx.z_mask = -1;
++        ctx.s_mask = 0;
+         /*
+          * Process each opcode.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_extrh_i64_i32:
+             done = fold_extu(&ctx, op);
+             break;
++        CASE_OP_32_64(ld8s):
+         CASE_OP_32_64(ld8u):
++        CASE_OP_32_64(ld16s):
+         CASE_OP_32_64(ld16u):
++        case INDEX_op_ld32s_i64:
+         case INDEX_op_ld32u_i64:
+             done = fold_tcg_ld(&ctx, op);
+             break;
 --
 .25.1

-[PULL 16/24] target/ppc: Drop exit checks for singlestep_enabled
+[PULL 53/56] tcg/optimize: Propagate sign info for logical operations
-GDB single-stepping is now handled generically.
+Sign repetitions are perforce all identical, whether they are 1 or 0.
-Reuse gen_debug_exception to handle architectural debug exceptions.
+Bitwise operations preserve the relative quantity of the repetitions.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/ppc/translate.c | 38 ++++++++------------------------------
+ tcg/optimize.c | 29 +++++++++++++++++++++++++++++
-file changed, 8 insertions(+), 30 deletions(-)
+file changed, 29 insertions(+)
-diff --git a/target/ppc/translate.c b/target/ppc/translate.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/ppc/translate.c
+--- a/tcg/optimize.c
-+++ b/target/ppc/translate.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
+     z2 = arg_info(op->args[2])->z_mask;
- #define CPU_SINGLE_STEP 0x1
+     ctx->z_mask = z1 & z2;
- #define CPU_BRANCH_STEP 0x2
--#define GDBSTUB_SINGLE_STEP 0x4
++    /*
++     * Sign repetitions are perforce all identical, whether they are 1 or 0.
- /* Include definitions for instructions classes and implementations flags */
++     * Bitwise operations preserve the relative quantity of the repetitions.
- /* #define PPC_DEBUG_DISAS */
++     */
-@@ -XXX,XX +XXX,XX @@ static uint32_t gen_prep_dbgex(DisasContext *ctx)
++    ctx->s_mask = arg_info(op->args[1])->s_mask
++                & arg_info(op->args[2])->s_mask;
- static void gen_debug_exception(DisasContext *ctx)
++
- {
+     /*
--    gen_helper_raise_exception(cpu_env, tcg_constant_i32(EXCP_DEBUG));
+      * Known-zeros does not imply known-ones.  Therefore unless
-+    gen_helper_raise_exception(cpu_env, tcg_constant_i32(gen_prep_dbgex(ctx)));
+      * arg2 is constant, we can't infer affected bits from it.
-     ctx->base.is_jmp = DISAS_NORETURN;
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
      }
      ctx->z_mask = z1;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
-@@ -XXX,XX +XXX,XX @@ static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+         fold_xi_to_not(ctx, op, 0)) {
- static void gen_lookup_and_goto_ptr(DisasContext *ctx)
+         return true;
  {
 -    int sse = ctx->singlestep_enabled;
 -    if (unlikely(sse)) {
 -        if (sse & GDBSTUB_SINGLE_STEP) {
 -            gen_debug_exception(ctx);
 -        } else if (sse & (CPU_SINGLE_STEP | CPU_BRANCH_STEP)) {
 -            gen_helper_raise_exception(cpu_env, tcg_constant_i32(gen_prep_dbgex(ctx)));
 -        } else {
 -            tcg_gen_exit_tb(NULL, 0);
 -        }
 +    if (unlikely(ctx->singlestep_enabled)) {
 +        gen_debug_exception(ctx);
      } else {
          tcg_gen_lookup_and_goto_ptr();
      }
-@@ -XXX,XX +XXX,XX @@ static void ppc_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
++
-     ctx->singlestep_enabled = 0;
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-     if ((hflags >> HFLAGS_SE) & 1) {
++                & arg_info(op->args[2])->s_mask;
-         ctx->singlestep_enabled |= CPU_SINGLE_STEP;
+     return false;
-+        ctx->base.max_insns = 1;
+ }
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
      ctx->z_mask = arg_info(op->args[3])->z_mask
                  | arg_info(op->args[4])->z_mask;
 +    ctx->s_mask = arg_info(op->args[3])->s_mask
 +                & arg_info(op->args[4])->s_mask;
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
-     if ((hflags >> HFLAGS_BE) & 1) {
++
-         ctx->singlestep_enabled |= CPU_BRANCH_STEP;
++    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
--    if (unlikely(ctx->base.singlestep_enabled)) {
++
--        ctx->singlestep_enabled |= GDBSTUB_SINGLE_STEP;
++    ctx->s_mask = arg_info(op->args[1])->s_mask
--    }
++                & arg_info(op->args[2])->s_mask;
--
+     return false;
 -    if (ctx->singlestep_enabled & (CPU_SINGLE_STEP | GDBSTUB_SINGLE_STEP)) {
 -        ctx->base.max_insns = 1;
 -    }
  }
- static void ppc_tr_tb_start(DisasContextBase *db, CPUState *cs)
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
-@@ -XXX,XX +XXX,XX @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
+         return true;
      DisasContext *ctx = container_of(dcbase, DisasContext, base);
      DisasJumpType is_jmp = ctx->base.is_jmp;
      target_ulong nip = ctx->base.pc_next;
 -    int sse;
      if (is_jmp == DISAS_NORETURN) {
          /* We have already exited the TB. */
@@ -XXX,XX +XXX,XX @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
      }
-     /* Honor single stepping. */
++    ctx->s_mask = arg_info(op->args[1])->s_mask;
--    sse = ctx->singlestep_enabled & (CPU_SINGLE_STEP | GDBSTUB_SINGLE_STEP);
++
--    if (unlikely(sse)) {
+     /* Because of fold_to_not, we want to always return true, via finish. */
-+    if (unlikely(ctx->singlestep_enabled & CPU_SINGLE_STEP)
+     finish_folding(ctx, op);
-+        && (nip <= 0x100 || nip > 0xf00)) {
+     return true;
-         switch (is_jmp) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
-         case DISAS_TOO_MANY:
-         case DISAS_EXIT_UPDATE:
+     ctx->z_mask = arg_info(op->args[1])->z_mask
-@@ -XXX,XX +XXX,XX @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
+                 | arg_info(op->args[2])->z_mask;
-             g_assert_not_reached();
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-         }
++                & arg_info(op->args[2])->s_mask;
+     return fold_masks(ctx, op);
--        if (sse & GDBSTUB_SINGLE_STEP) {
+ }
--            gen_debug_exception(ctx);
--            return;
+@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
--        }
+         fold_ix_to_not(ctx, op, 0)) {
--        /* else CPU_SINGLE_STEP... */
+         return true;
 -        if (nip <= 0x100 || nip > 0xf00) {
 -            gen_helper_raise_exception(cpu_env, tcg_constant_i32(gen_prep_dbgex(ctx)));
 -            return;
 -        }
 +        gen_debug_exception(ctx);
 +        return;
      }
++
-     switch (is_jmp) {
++    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
      ctx->z_mask = arg_info(op->args[1])->z_mask
                  | arg_info(op->args[2])->z_mask;
 +    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return fold_masks(ctx, op);
  }
 --
 .25.1

-[PULL 06/24] target/arm: Drop checks for singlestep_enabled
+[PULL 54/56] tcg/optimize: Propagate sign info for setcond
-GDB single-stepping is now handled generically.
+The result is either 0 or 1, which means that we have
 a 2 bit signed result, and thus 62 bits of sign.
 For clarity, use the smask_from_zmask function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/arm/translate-a64.c | 10 ++--------
+ tcg/optimize.c | 2 ++
- target/arm/translate.c     | 36 ++++++------------------------------
+file changed, 2 insertions(+)
 files changed, 8 insertions(+), 38 deletions(-)
-diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/arm/translate-a64.c
+--- a/tcg/optimize.c
-+++ b/target/arm/translate-a64.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
-         gen_a64_set_pc_im(dest);
+     }
-         if (s->ss_active) {
-             gen_step_complete_exception(s);
+     ctx->z_mask = 1;
--        } else if (s->base.singlestep_enabled) {
++    ctx->s_mask = smask_from_zmask(1);
--            gen_exception_internal(EXCP_DEBUG);
+     return false;
          } else {
              tcg_gen_lookup_and_goto_ptr();
              s->base.is_jmp = DISAS_NORETURN;
@@ -XXX,XX +XXX,XX @@ static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
  {
      DisasContext *dc = container_of(dcbase, DisasContext, base);
 -    if (unlikely(dc->base.singlestep_enabled || dc->ss_active)) {
 +    if (unlikely(dc->ss_active)) {
          /* Note that this means single stepping WFI doesn't halt the CPU.
           * For conditional branch insns this is harmless unreachable code as
           * gen_goto_tb() has already handled emitting the debug exception
@@ -XXX,XX +XXX,XX @@ static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
              /* fall through */
          case DISAS_EXIT:
          case DISAS_JUMP:
 -            if (dc->base.singlestep_enabled) {
 -                gen_exception_internal(EXCP_DEBUG);
 -            } else {
 -                gen_step_complete_exception(dc);
 -            }
 +            gen_step_complete_exception(dc);
              break;
          case DISAS_NORETURN:
              break;
 diff --git a/target/arm/translate.c b/target/arm/translate.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/arm/translate.c
 +++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_exception_internal(int excp)
      tcg_temp_free_i32(tcg_excp);
  }
--static void gen_step_complete_exception(DisasContext *s)
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 +static void gen_singlestep_exception(DisasContext *s)
  {
      /* We just completed step of an insn. Move from Active-not-pending
       * to Active-pending, and then also take the swstep exception.
@@ -XXX,XX +XXX,XX @@ static void gen_step_complete_exception(DisasContext *s)
      s->base.is_jmp = DISAS_NORETURN;
  }
 -static void gen_singlestep_exception(DisasContext *s)
 -{
 -    /* Generate the right kind of exception for singlestep, which is
 -     * either the architectural singlestep or EXCP_DEBUG for QEMU's
 -     * gdb singlestepping.
 -     */
 -    if (s->ss_active) {
 -        gen_step_complete_exception(s);
 -    } else {
 -        gen_exception_internal(EXCP_DEBUG);
 -    }
 -}
 -
 -static inline bool is_singlestepping(DisasContext *s)
 -{
 -    /* Return true if we are singlestepping either because of
 -     * architectural singlestep or QEMU gdbstub singlestep. This does
 -     * not include the command line '-singlestep' mode which is rather
 -     * misnamed as it only means "one instruction per TB" and doesn't
 -     * affect the code we generate.
 -     */
 -    return s->base.singlestep_enabled || s->ss_active;
 -}
 -
  void clear_eci_state(DisasContext *s)
  {
      /*
@@ -XXX,XX +XXX,XX @@ static inline void gen_bx_excret_final_code(DisasContext *s)
      /* Is the new PC value in the magic range indicating exception return? */
      tcg_gen_brcondi_i32(TCG_COND_GEU, cpu_R[15], min_magic, excret_label);
      /* No: end the TB as we would for a DISAS_JMP */
 -    if (is_singlestepping(s)) {
 +    if (s->ss_active) {
          gen_singlestep_exception(s);
      } else {
          tcg_gen_exit_tb(NULL, 0);
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
  /* Jump, specifying which TB number to use if we gen_goto_tb() */
  static inline void gen_jmp_tb(DisasContext *s, uint32_t dest, int tbno)
  {
 -    if (unlikely(is_singlestepping(s))) {
 +    if (unlikely(s->ss_active)) {
          /* An indirect jump so that we still trigger the debug exception.  */
          gen_set_pc_im(s, dest);
          s->base.is_jmp = DISAS_JUMP;
@@ -XXX,XX +XXX,XX @@ static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
      dc->page_start = dc->base.pc_first & TARGET_PAGE_MASK;
      /* If architectural single step active, limit to 1.  */
 -    if (is_singlestepping(dc)) {
 +    if (dc->ss_active) {
          dc->base.max_insns = 1;
      }
-@@ -XXX,XX +XXX,XX @@ static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
+     ctx->z_mask = 1;
-          * insn codepath itself.
++    ctx->s_mask = smask_from_zmask(1);
-          */
+     return false;
-         gen_bx_excret_final_code(dc);
--    } else if (unlikely(is_singlestepping(dc))) {
+  do_setcond_const:
 +    } else if (unlikely(dc->ss_active)) {
          /* Unconditional and "condition passed" instruction codepath. */
          switch (dc->base.is_jmp) {
          case DISAS_SWI:
@@ -XXX,XX +XXX,XX @@ static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
          /* "Condition failed" instruction codepath for the branch/trap insn */
          gen_set_label(dc->condlabel);
          gen_set_condexec(dc);
 -        if (unlikely(is_singlestepping(dc))) {
 +        if (unlikely(dc->ss_active)) {
              gen_set_pc_im(dc, dc->base.pc_next);
              gen_singlestep_exception(dc);
          } else {
 --
 .25.1

-[PULL 17/24] target/riscv: Remove dead code after exception
+[PULL 55/56] tcg/optimize: Propagate sign info for bit counting
-We have already set DISAS_NORETURN in generate_exception,
+The results are generally 6 bit unsigned values, though
-which makes the exit_tb unreachable.
+the count leading and trailing bits may produce any value
 for a zero input.
-Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- target/riscv/insn_trans/trans_privileged.c.inc | 6 +-----
+ tcg/optimize.c | 3 ++-
-file changed, 1 insertion(+), 5 deletions(-)
+file changed, 2 insertions(+), 1 deletion(-)
-diff --git a/target/riscv/insn_trans/trans_privileged.c.inc b/target/riscv/insn_trans/trans_privileged.c.inc
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/target/riscv/insn_trans/trans_privileged.c.inc
+--- a/tcg/optimize.c
-+++ b/target/riscv/insn_trans/trans_privileged.c.inc
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ static bool trans_ecall(DisasContext *ctx, arg_ecall *a)
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
- {
+         g_assert_not_reached();
-     /* always generates U-level ECALL, fixed in do_interrupt handler */
+     }
-     generate_exception(ctx, RISCV_EXCP_U_ECALL);
+     ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
--    exit_tb(ctx); /* no chaining */
+-
--    ctx->base.is_jmp = DISAS_NORETURN;
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
-     return true;
+     return false;
  }
-@@ -XXX,XX +XXX,XX @@ static bool trans_ebreak(DisasContext *ctx, arg_ebreak *a)
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-         post   = opcode_at(&ctx->base, post_addr);
+     default:
          g_assert_not_reached();
      }
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
--    if  (pre == 0x01f01013 && ebreak == 0x00100073 && post == 0x40705013) {
+     return false;
 +    if (pre == 0x01f01013 && ebreak == 0x00100073 && post == 0x40705013) {
          generate_exception(ctx, RISCV_EXCP_SEMIHOST);
      } else {
          generate_exception(ctx, RISCV_EXCP_BREAKPOINT);
      }
 -    exit_tb(ctx); /* no chaining */
 -    ctx->base.is_jmp = DISAS_NORETURN;
      return true;
  }
 --
 .25.1

-[PULL 01/24] accel/tcg: Handle gdb singlestep in cpu_tb_exec
+[PULL 56/56] tcg/optimize: Propagate sign info for shifting
-Currently the change in cpu_tb_exec is masked by the debug exception
+For constant shifts, we can simply shift the s_mask.
 being raised by the translators.  But this allows us to remove that code.
+For variable shifts, we know that sar does not reduce
+the s_mask, which helps for sequences like
+    ext32s_i64  t, in
+    sar_i64     t, t, v
+    ext32s_i64  out, t
+allowing the final extend to be eliminated.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/cpu-exec.c | 11 +++++++++++
+ tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
-file changed, 11 insertions(+)
+file changed, 47 insertions(+), 3 deletions(-)
-diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/cpu-exec.c
+--- a/tcg/optimize.c
-+++ b/accel/tcg/cpu-exec.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
+@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
-             cc->set_pc(cpu, last_tb->pc);
+     return ~(~0ull >> rep);
-         }
+ }
 +/*
 + * Recreate a properly left-aligned smask after manipulation.
 + * Some bit-shuffling, particularly shifts and rotates, may
 + * retain sign bits on the left, but may scatter disconnected
 + * sign bits on the right.  Retain only what remains to the left.
 + */
 +static uint64_t smask_from_smask(int64_t smask)
 +{
 +    /* Only the 1 bits are significant for smask */
 +    return smask_from_zmask(~smask);
 +}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t s_mask, z_mask, sign;
 +
      if (fold_const2(ctx, op) ||
          fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
 +    s_mask = arg_info(op->args[1])->s_mask;
 +    z_mask = arg_info(op->args[1])->z_mask;
 +
      if (arg_is_const(op->args[2])) {
 -        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
 -                                          arg_info(op->args[1])->z_mask,
 -                                          arg_info(op->args[2])->val);
 +        int sh = arg_info(op->args[2])->val;
 +
 +        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
 +
 +        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
 +        ctx->s_mask = smask_from_smask(s_mask);
 +
          return fold_masks(ctx, op);
      }
 +
-+    /*
++    switch (op->opc) {
-+     * If gdb single-step, and we haven't raised another exception,
++    CASE_OP_32_64(sar):
-+     * raise a debug exception.  Single-step with another exception
++        /*
-+     * is handled in cpu_handle_exception.
++         * Arithmetic right shift will not reduce the number of
-+     */
++         * input sign repetitions.
-+    if (unlikely(cpu->singlestep_enabled) && cpu->exception_index == -1) {
++         */
-+        cpu->exception_index = EXCP_DEBUG;
++        ctx->s_mask = s_mask;
-+        cpu_loop_exit(cpu);
++        break;
 +    CASE_OP_32_64(shr):
 +        /*
 +         * If the sign bit is known zero, then logical right shift
 +         * will not reduced the number of input sign repetitions.
 +         */
 +        sign = (s_mask & -s_mask) >> 1;
 +        if (!(z_mask & sign)) {
 +            ctx->s_mask = s_mask;
 +        }
 +        break;
 +    default:
 +        break;
 +    }
 +
-     return last_tb;
+     return false;
  }
 --
 .25.1

The following changes since commit 6587b0c1331d427b0939c37e763842550ed581db:

Merge remote-tracking branch 'remotes/ericb/tags/pull-nbd-2021-10-15' into staging (2021-10-15 14:16:28 -0700)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211016

for you to fetch changes up to 995b87dedc78b0467f5f18bbc3546072ba97516a:

Revert "cpu: Move cpu_common_props to hw/core/cpu.c" (2021-10-15 16:39:15 -0700)

----------------------------------------------------------------
Move gdb singlestep to generic code
Fix cpu_common_props

----------------------------------------------------------------
Richard Henderson (24):
      accel/tcg: Handle gdb singlestep in cpu_tb_exec
      target/alpha: Drop checks for singlestep_enabled
      target/avr: Drop checks for singlestep_enabled
      target/cris: Drop checks for singlestep_enabled
      target/hexagon: Drop checks for singlestep_enabled
      target/arm: Drop checks for singlestep_enabled
      target/hppa: Drop checks for singlestep_enabled
      target/i386: Check CF_NO_GOTO_TB for dc->jmp_opt
      target/i386: Drop check for singlestep_enabled
      target/m68k: Drop checks for singlestep_enabled
      target/microblaze: Check CF_NO_GOTO_TB for DISAS_JUMP
      target/microblaze: Drop checks for singlestep_enabled
      target/mips: Fix single stepping
      target/mips: Drop exit checks for singlestep_enabled
      target/openrisc: Drop checks for singlestep_enabled
      target/ppc: Drop exit checks for singlestep_enabled
      target/riscv: Remove dead code after exception
      target/riscv: Remove exit_tb and lookup_and_goto_ptr
      target/rx: Drop checks for singlestep_enabled
      target/s390x: Drop check for singlestep_enabled
      target/sh4: Drop check for singlestep_enabled
      target/tricore: Drop check for singlestep_enabled
      target/xtensa: Drop check for singlestep_enabled
      Revert "cpu: Move cpu_common_props to hw/core/cpu.c"

GDB single-stepping is now handled generically.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/alpha/translate.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -XXX,XX +XXX,XX @@ static void alpha_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
         tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
         /* FALLTHRU */
     case DISAS_PC_UPDATED:
-        if (!ctx->base.singlestep_enabled) {
-            tcg_gen_lookup_and_goto_ptr();
-            break;
-        }
-        /* FALLTHRU */
+        tcg_gen_lookup_and_goto_ptr();
+        break;
     case DISAS_PC_UPDATED_NOCHAIN:
-        if (ctx->base.singlestep_enabled) {
-            gen_excp_1(EXCP_DEBUG, 0);
-        } else {
-            tcg_gen_exit_tb(NULL, 0);
-        }
+        tcg_gen_exit_tb(NULL, 0);
         break;
     default:
         g_assert_not_reached();
-- 
2.25.1

GDB single-stepping is now handled generically.

Tested-by: Michael Rolnik <mrolnik@gmail.com>
Reviewed-by: Michael Rolnik <mrolnik@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/avr/translate.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/target/avr/translate.c b/target/avr/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/avr/translate.c
+++ b/target/avr/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
         tcg_gen_exit_tb(tb, n);
     } else {
         tcg_gen_movi_i32(cpu_pc, dest);
-        if (ctx->base.singlestep_enabled) {
-            gen_helper_debug(cpu_env);
-        } else {
-            tcg_gen_lookup_and_goto_ptr();
-        }
+        tcg_gen_lookup_and_goto_ptr();
     }
     ctx->base.is_jmp = DISAS_NORETURN;
 }
@@ -XXX,XX +XXX,XX @@ static void avr_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
         tcg_gen_movi_tl(cpu_pc, ctx->npc);
         /* fall through */
     case DISAS_LOOKUP:
-        if (!ctx->base.singlestep_enabled) {
-            tcg_gen_lookup_and_goto_ptr();
-            break;
-        }
-        /* fall through */
+        tcg_gen_lookup_and_goto_ptr();
+        break;
     case DISAS_EXIT:
-        if (ctx->base.singlestep_enabled) {
-            gen_helper_debug(cpu_env);
-        } else {
-            tcg_gen_exit_tb(NULL, 0);
-        }
+        tcg_gen_exit_tb(NULL, 0);
         break;
     default:
         g_assert_not_reached();
-- 
2.25.1

GDB single-stepping is now handled generically.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/translate.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_end_tb(DisasContext *ctx)
 {
     gen_exec_counters(ctx);
     tcg_gen_mov_tl(hex_gpr[HEX_REG_PC], hex_next_PC);
-    if (ctx->base.singlestep_enabled) {
-        gen_exception_raw(EXCP_DEBUG);
-    } else {
-        tcg_gen_exit_tb(NULL, 0);
-    }
+    tcg_gen_exit_tb(NULL, 0);
     ctx->base.is_jmp = DISAS_NORETURN;
 }
 
@@ -XXX,XX +XXX,XX @@ static void hexagon_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
     case DISAS_TOO_MANY:
         gen_exec_counters(ctx);
         tcg_gen_movi_tl(hex_gpr[HEX_REG_PC], ctx->base.pc_next);
-        if (ctx->base.singlestep_enabled) {
-            gen_exception_raw(EXCP_DEBUG);
-        } else {
-            tcg_gen_exit_tb(NULL, 0);
-        }
+        tcg_gen_exit_tb(NULL, 0);
         break;
     case DISAS_NORETURN:
         break;
-- 
2.25.1

GDB single-stepping is now handled generically.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/translate-a64.c | 10 ++--------
 target/arm/translate.c     | 36 ++++++------------------------------
 2 files changed, 8 insertions(+), 38 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -XXX,XX +XXX,XX @@ static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
         gen_a64_set_pc_im(dest);
         if (s->ss_active) {
             gen_step_complete_exception(s);
-        } else if (s->base.singlestep_enabled) {
-            gen_exception_internal(EXCP_DEBUG);
         } else {
             tcg_gen_lookup_and_goto_ptr();
             s->base.is_jmp = DISAS_NORETURN;
@@ -XXX,XX +XXX,XX @@ static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
 {
     DisasContext *dc = container_of(dcbase, DisasContext, base);
 
-    if (unlikely(dc->base.singlestep_enabled || dc->ss_active)) {
+    if (unlikely(dc->ss_active)) {
         /* Note that this means single stepping WFI doesn't halt the CPU.
          * For conditional branch insns this is harmless unreachable code as
          * gen_goto_tb() has already handled emitting the debug exception
@@ -XXX,XX +XXX,XX @@ static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
             /* fall through */
         case DISAS_EXIT:
         case DISAS_JUMP:
-            if (dc->base.singlestep_enabled) {
-                gen_exception_internal(EXCP_DEBUG);
-            } else {
-                gen_step_complete_exception(dc);
-            }
+            gen_step_complete_exception(dc);
             break;
         case DISAS_NORETURN:
             break;
diff --git a/target/arm/translate.c b/target/arm/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_exception_internal(int excp)
     tcg_temp_free_i32(tcg_excp);
 }
 
-static void gen_step_complete_exception(DisasContext *s)
+static void gen_singlestep_exception(DisasContext *s)
 {
     /* We just completed step of an insn. Move from Active-not-pending
      * to Active-pending, and then also take the swstep exception.
@@ -XXX,XX +XXX,XX @@ static void gen_step_complete_exception(DisasContext *s)
     s->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_singlestep_exception(DisasContext *s)
-{
-    /* Generate the right kind of exception for singlestep, which is
-     * either the architectural singlestep or EXCP_DEBUG for QEMU's
-     * gdb singlestepping.
-     */
-    if (s->ss_active) {
-        gen_step_complete_exception(s);
-    } else {
-        gen_exception_internal(EXCP_DEBUG);
-    }
-}
-
-static inline bool is_singlestepping(DisasContext *s)
-{
-    /* Return true if we are singlestepping either because of
-     * architectural singlestep or QEMU gdbstub singlestep. This does
-     * not include the command line '-singlestep' mode which is rather
-     * misnamed as it only means "one instruction per TB" and doesn't
-     * affect the code we generate.
-     */
-    return s->base.singlestep_enabled || s->ss_active;
-}
-
 void clear_eci_state(DisasContext *s)
 {
     /*
@@ -XXX,XX +XXX,XX @@ static inline void gen_bx_excret_final_code(DisasContext *s)
     /* Is the new PC value in the magic range indicating exception return? */
     tcg_gen_brcondi_i32(TCG_COND_GEU, cpu_R[15], min_magic, excret_label);
     /* No: end the TB as we would for a DISAS_JMP */
-    if (is_singlestepping(s)) {
+    if (s->ss_active) {
         gen_singlestep_exception(s);
     } else {
         tcg_gen_exit_tb(NULL, 0);
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
 /* Jump, specifying which TB number to use if we gen_goto_tb() */
 static inline void gen_jmp_tb(DisasContext *s, uint32_t dest, int tbno)
 {
-    if (unlikely(is_singlestepping(s))) {
+    if (unlikely(s->ss_active)) {
         /* An indirect jump so that we still trigger the debug exception.  */
         gen_set_pc_im(s, dest);
         s->base.is_jmp = DISAS_JUMP;
@@ -XXX,XX +XXX,XX @@ static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
     dc->page_start = dc->base.pc_first & TARGET_PAGE_MASK;
 
     /* If architectural single step active, limit to 1.  */
-    if (is_singlestepping(dc)) {
+    if (dc->ss_active) {
         dc->base.max_insns = 1;
     }
 
@@ -XXX,XX +XXX,XX @@ static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
          * insn codepath itself.
          */
         gen_bx_excret_final_code(dc);
-    } else if (unlikely(is_singlestepping(dc))) {
+    } else if (unlikely(dc->ss_active)) {
         /* Unconditional and "condition passed" instruction codepath. */
         switch (dc->base.is_jmp) {
         case DISAS_SWI:
@@ -XXX,XX +XXX,XX @@ static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
         /* "Condition failed" instruction codepath for the branch/trap insn */
         gen_set_label(dc->condlabel);
         gen_set_condexec(dc);
-        if (unlikely(is_singlestepping(dc))) {
+        if (unlikely(dc->ss_active)) {
             gen_set_pc_im(dc, dc->base.pc_next);
             gen_singlestep_exception(dc);
         } else {
-- 
2.25.1

GDB single-stepping is now handled generically.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hppa/translate.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int which,
     } else {
         copy_iaoq_entry(cpu_iaoq_f, f, cpu_iaoq_b);
         copy_iaoq_entry(cpu_iaoq_b, b, ctx->iaoq_n_var);
-        if (ctx->base.singlestep_enabled) {
-            gen_excp_1(EXCP_DEBUG);
-        } else {
-            tcg_gen_lookup_and_goto_ptr();
-        }
+        tcg_gen_lookup_and_goto_ptr();
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool do_rfi(DisasContext *ctx, bool rfi_r)
         gen_helper_rfi(cpu_env);
     }
     /* Exit the TB to recognize new interrupts.  */
-    if (ctx->base.singlestep_enabled) {
-        gen_excp_1(EXCP_DEBUG);
-    } else {
-        tcg_gen_exit_tb(NULL, 0);
-    }
+    tcg_gen_exit_tb(NULL, 0);
     ctx->base.is_jmp = DISAS_NORETURN;
 
     return nullify_end(ctx);
@@ -XXX,XX +XXX,XX @@ static void hppa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
         nullify_save(ctx);
         /* FALLTHRU */
     case DISAS_IAQ_N_UPDATED:
-        if (ctx->base.singlestep_enabled) {
-            gen_excp_1(EXCP_DEBUG);
-        } else if (is_jmp != DISAS_IAQ_N_STALE_EXIT) {
+        if (is_jmp != DISAS_IAQ_N_STALE_EXIT) {
             tcg_gen_lookup_and_goto_ptr();
+            break;
         }
         /* FALLTHRU */
     case DISAS_EXIT:
-- 
2.25.1

We were using singlestep_enabled as a proxy for whether
translator_use_goto_tb would always return false.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/tcg/translate.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
     DisasContext *dc = container_of(dcbase, DisasContext, base);
     CPUX86State *env = cpu->env_ptr;
     uint32_t flags = dc->base.tb->flags;
+    uint32_t cflags = tb_cflags(dc->base.tb);
     int cpl = (flags >> HF_CPL_SHIFT) & 3;
     int iopl = (flags >> IOPL_SHIFT) & 3;
 
@@ -XXX,XX +XXX,XX @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
     dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX];
     dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX];
     dc->cpuid_xsave_features = env->features[FEAT_XSAVE];
-    dc->jmp_opt = !(dc->base.singlestep_enabled ||
+    dc->jmp_opt = !((cflags & CF_NO_GOTO_TB) ||
                     (flags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK)));
     /*
      * If jmp_opt, we want to handle each string instruction individually.
      * For icount also disable repz optimization so that each iteration
      * is accounted separately.
      */
-    dc->repz_opt = !dc->jmp_opt && !(tb_cflags(dc->base.tb) & CF_USE_ICOUNT);
+    dc->repz_opt = !dc->jmp_opt && !(cflags & CF_USE_ICOUNT);
 
     dc->T0 = tcg_temp_new();
     dc->T1 = tcg_temp_new();
-- 
2.25.1

GDB single-stepping is now handled generically.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/helper.h          | 1 -
 target/i386/tcg/misc_helper.c | 8 --------
 target/i386/tcg/translate.c   | 4 +---
 3 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/target/i386/helper.h b/target/i386/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/helper.h
+++ b/target/i386/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_2(syscall, void, env, int)
 DEF_HELPER_2(sysret, void, env, int)
 #endif
 DEF_HELPER_FLAGS_2(pause, TCG_CALL_NO_WG, noreturn, env, int)
-DEF_HELPER_FLAGS_1(debug, TCG_CALL_NO_WG, noreturn, env)
 DEF_HELPER_1(reset_rf, void, env)
 DEF_HELPER_FLAGS_3(raise_interrupt, TCG_CALL_NO_WG, noreturn, env, int, int)
 DEF_HELPER_FLAGS_2(raise_exception, TCG_CALL_NO_WG, noreturn, env, int)
diff --git a/target/i386/tcg/misc_helper.c b/target/i386/tcg/misc_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/misc_helper.c
+++ b/target/i386/tcg/misc_helper.c
@@ -XXX,XX +XXX,XX @@ void QEMU_NORETURN helper_pause(CPUX86State *env, int next_eip_addend)
     do_pause(env);
 }
 
-void QEMU_NORETURN helper_debug(CPUX86State *env)
-{
-    CPUState *cs = env_cpu(env);
-
-    cs->exception_index = EXCP_DEBUG;
-    cpu_loop_exit(cs);
-}
-
 uint64_t helper_rdpkru(CPUX86State *env, uint32_t ecx)
 {
     if ((env->cr[4] & CR4_PKE_MASK) == 0) {
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ do_gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf, bool jr)
     if (s->base.tb->flags & HF_RF_MASK) {
         gen_helper_reset_rf(cpu_env);
     }
-    if (s->base.singlestep_enabled) {
-        gen_helper_debug(cpu_env);
-    } else if (recheck_tf) {
+    if (recheck_tf) {
         gen_helper_rechecking_single_step(cpu_env);
         tcg_gen_exit_tb(NULL, 0);
     } else if (s->flags & HF_TF_MASK) {
-- 
2.25.1

GDB single-stepping is now handled generically.

Acked-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/m68k/translate.c | 44 +++++++++--------------------------------
 1 file changed, 9 insertions(+), 35 deletions(-)

diff --git a/target/m68k/translate.c b/target/m68k/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/m68k/translate.c
+++ b/target/m68k/translate.c
@@ -XXX,XX +XXX,XX @@ static void do_writebacks(DisasContext *s)
     }
 }
 
-static bool is_singlestepping(DisasContext *s)
-{
-    /*
-     * Return true if we are singlestepping either because of
-     * architectural singlestep or QEMU gdbstub singlestep. This does
-     * not include the command line '-singlestep' mode which is rather
-     * misnamed as it only means "one instruction per TB" and doesn't
-     * affect the code we generate.
-     */
-    return s->base.singlestep_enabled || s->ss_active;
-}
-
 /* is_jmp field values */
 #define DISAS_JUMP      DISAS_TARGET_0 /* only pc was modified dynamically */
 #define DISAS_EXIT      DISAS_TARGET_1 /* cpu state was modified dynamically */
@@ -XXX,XX +XXX,XX @@ static void gen_exception(DisasContext *s, uint32_t dest, int nr)
     s->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_singlestep_exception(DisasContext *s)
-{
-    /*
-     * Generate the right kind of exception for singlestep, which is
-     * either the architectural singlestep or EXCP_DEBUG for QEMU's
-     * gdb singlestepping.
-     */
-    if (s->ss_active) {
-        gen_raise_exception(EXCP_TRACE);
-    } else {
-        gen_raise_exception(EXCP_DEBUG);
-    }
-}
-
 static inline void gen_addr_fault(DisasContext *s)
 {
     gen_exception(s, s->base.pc_next, EXCP_ADDRESS);
@@ -XXX,XX +XXX,XX @@ static void gen_exit_tb(DisasContext *s)
 /* Generate a jump to an immediate address.  */
 static void gen_jmp_tb(DisasContext *s, int n, uint32_t dest)
 {
-    if (unlikely(is_singlestepping(s))) {
+    if (unlikely(s->ss_active)) {
         update_cc_op(s);
         tcg_gen_movi_i32(QREG_PC, dest);
-        gen_singlestep_exception(s);
+        gen_raise_exception(EXCP_TRACE);
     } else if (translator_use_goto_tb(&s->base, dest)) {
         tcg_gen_goto_tb(n);
         tcg_gen_movi_i32(QREG_PC, dest);
@@ -XXX,XX +XXX,XX @@ static void m68k_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu)
 
     dc->ss_active = (M68K_SR_TRACE(env->sr) == M68K_SR_TRACE_ANY_INS);
     /* If architectural single step active, limit to 1 */
-    if (is_singlestepping(dc)) {
+    if (dc->ss_active) {
         dc->base.max_insns = 1;
     }
 }
@@ -XXX,XX +XXX,XX @@ static void m68k_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
         break;
     case DISAS_TOO_MANY:
         update_cc_op(dc);
-        if (is_singlestepping(dc)) {
+        if (dc->ss_active) {
             tcg_gen_movi_i32(QREG_PC, dc->pc);
-            gen_singlestep_exception(dc);
+            gen_raise_exception(EXCP_TRACE);
         } else {
             gen_jmp_tb(dc, 0, dc->pc);
         }
         break;
     case DISAS_JUMP:
         /* We updated CC_OP and PC in gen_jmp/gen_jmp_im.  */
-        if (is_singlestepping(dc)) {
-            gen_singlestep_exception(dc);
+        if (dc->ss_active) {
+            gen_raise_exception(EXCP_TRACE);
         } else {
             tcg_gen_lookup_and_goto_ptr();
         }
@@ -XXX,XX +XXX,XX @@ static void m68k_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
          * We updated CC_OP and PC in gen_exit_tb, but also modified
          * other state that may require returning to the main loop.
          */
-        if (is_singlestepping(dc)) {
-            gen_singlestep_exception(dc);
+        if (dc->ss_active) {
+            gen_raise_exception(EXCP_TRACE);
         } else {
             tcg_gen_exit_tb(NULL, 0);
         }
-- 
2.25.1

We were using singlestep_enabled as a proxy for whether
translator_use_goto_tb would always return false.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/microblaze/translate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -XXX,XX +XXX,XX @@ static void mb_tr_tb_stop(DisasContextBase *dcb, CPUState *cs)
         break;
 
     case DISAS_JUMP:
-        if (dc->jmp_dest != -1 && !cs->singlestep_enabled) {
+        if (dc->jmp_dest != -1 && !(tb_cflags(dc->base.tb) & CF_NO_GOTO_TB)) {
             /* Direct jump. */
             tcg_gen_discard_i32(cpu_btarget);
 
@@ -XXX,XX +XXX,XX @@ static void mb_tr_tb_stop(DisasContextBase *dcb, CPUState *cs)
             return;
         }
 
-        /* Indirect jump (or direct jump w/ singlestep) */
+        /* Indirect jump (or direct jump w/ goto_tb disabled) */
         tcg_gen_mov_i32(cpu_pc, cpu_btarget);
         tcg_gen_discard_i32(cpu_btarget);
 
-- 
2.25.1

GDB single-stepping is now handled generically.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/microblaze/translate.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_raise_hw_excp(DisasContext *dc, uint32_t esr_ec)
 
 static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
 {
-    if (dc->base.singlestep_enabled) {
-        TCGv_i32 tmp = tcg_const_i32(EXCP_DEBUG);
-        tcg_gen_movi_i32(cpu_pc, dest);
-        gen_helper_raise_exception(cpu_env, tmp);
-        tcg_temp_free_i32(tmp);
-    } else if (translator_use_goto_tb(&dc->base, dest)) {
+    if (translator_use_goto_tb(&dc->base, dest)) {
         tcg_gen_goto_tb(n);
         tcg_gen_movi_i32(cpu_pc, dest);
         tcg_gen_exit_tb(dc->base.tb, n);
@@ -XXX,XX +XXX,XX @@ static void mb_tr_tb_stop(DisasContextBase *dcb, CPUState *cs)
         /* Indirect jump (or direct jump w/ goto_tb disabled) */
         tcg_gen_mov_i32(cpu_pc, cpu_btarget);
         tcg_gen_discard_i32(cpu_btarget);
-
-        if (unlikely(cs->singlestep_enabled)) {
-            gen_raise_exception(dc, EXCP_DEBUG);
-        } else {
-            tcg_gen_lookup_and_goto_ptr();
-        }
+        tcg_gen_lookup_and_goto_ptr();
         return;
 
     default:
-- 
2.25.1

As per an ancient comment in mips_tr_translate_insn about the
expectations of gdb, when restarting the insn in a delay slot
we also re-execute the branch.  Which means that we are
expected to execute two insns in this case.

This has been broken since 8b86d6d2580, where we forced max_insns
to 1 while single-stepping.  This resulted in an exit from the
translator loop after the branch but before the delay slot is
translated.

Increase the max_insns to 2 for this case.  In addition, bypass
the end-of-page check, for when the branch itself ends the page.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/mips/tcg/translate.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static void mips_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
     ctx->default_tcg_memop_mask = (ctx->insn_flags & (ISA_MIPS_R6 |
                                   INSN_LOONGSON3A)) ? MO_UNALN : MO_ALIGN;
 
+    /*
+     * Execute a branch and its delay slot as a single instruction.
+     * This is what GDB expects and is consistent with what the
+     * hardware does (e.g. if a delay slot instruction faults, the
+     * reported PC is the PC of the branch).
+     */
+    if (ctx->base.singlestep_enabled && (ctx->hflags & MIPS_HFLAG_BMASK)) {
+        ctx->base.max_insns = 2;
+    }
+
     LOG_DISAS("\ntb %p idx %d hflags %04x\n", ctx->base.tb, ctx->mem_idx,
               ctx->hflags);
 }
@@ -XXX,XX +XXX,XX @@ static void mips_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
     if (ctx->base.is_jmp != DISAS_NEXT) {
         return;
     }
+
     /*
-     * Execute a branch and its delay slot as a single instruction.
-     * This is what GDB expects and is consistent with what the
-     * hardware does (e.g. if a delay slot instruction faults, the
-     * reported PC is the PC of the branch).
+     * End the TB on (most) page crossings.
+     * See mips_tr_init_disas_context about single-stepping a branch
+     * together with its delay slot.
      */
-    if (ctx->base.singlestep_enabled &&
-        (ctx->hflags & MIPS_HFLAG_BMASK) == 0) {
-        ctx->base.is_jmp = DISAS_TOO_MANY;
-    }
-    if (ctx->base.pc_next - ctx->page_start >= TARGET_PAGE_SIZE) {
+    if (ctx->base.pc_next - ctx->page_start >= TARGET_PAGE_SIZE
+        && !ctx->base.singlestep_enabled) {
         ctx->base.is_jmp = DISAS_TOO_MANY;
     }
 }
-- 
2.25.1

GDB single-stepping is now handled generically.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/mips/tcg/translate.c | 50 +++++++++++++------------------------
 1 file changed, 18 insertions(+), 32 deletions(-)

diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
         tcg_gen_exit_tb(ctx->base.tb, n);
     } else {
         gen_save_pc(dest);
-        if (ctx->base.singlestep_enabled) {
-            save_cpu_state(ctx, 0);
-            gen_helper_raise_exception_debug(cpu_env);
-        } else {
-            tcg_gen_lookup_and_goto_ptr();
-        }
+        tcg_gen_lookup_and_goto_ptr();
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void gen_branch(DisasContext *ctx, int insn_bytes)
             } else {
                 tcg_gen_mov_tl(cpu_PC, btarget);
             }
-            if (ctx->base.singlestep_enabled) {
-                save_cpu_state(ctx, 0);
-                gen_helper_raise_exception_debug(cpu_env);
-            }
             tcg_gen_lookup_and_goto_ptr();
             break;
         default:
@@ -XXX,XX +XXX,XX @@ static void mips_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
 {
     DisasContext *ctx = container_of(dcbase, DisasContext, base);
 
-    if (ctx->base.singlestep_enabled && ctx->base.is_jmp != DISAS_NORETURN) {
-        save_cpu_state(ctx, ctx->base.is_jmp != DISAS_EXIT);
-        gen_helper_raise_exception_debug(cpu_env);
-    } else {
-        switch (ctx->base.is_jmp) {
-        case DISAS_STOP:
-            gen_save_pc(ctx->base.pc_next);
-            tcg_gen_lookup_and_goto_ptr();
-            break;
-        case DISAS_NEXT:
-        case DISAS_TOO_MANY:
-            save_cpu_state(ctx, 0);
-            gen_goto_tb(ctx, 0, ctx->base.pc_next);
-            break;
-        case DISAS_EXIT:
-            tcg_gen_exit_tb(NULL, 0);
-            break;
-        case DISAS_NORETURN:
-            break;
-        default:
-            g_assert_not_reached();
-        }
+    switch (ctx->base.is_jmp) {
+    case DISAS_STOP:
+        gen_save_pc(ctx->base.pc_next);
+        tcg_gen_lookup_and_goto_ptr();
+        break;
+    case DISAS_NEXT:
+    case DISAS_TOO_MANY:
+        save_cpu_state(ctx, 0);
+        gen_goto_tb(ctx, 0, ctx->base.pc_next);
+        break;
+    case DISAS_EXIT:
+        tcg_gen_exit_tb(NULL, 0);
+        break;
+    case DISAS_NORETURN:
+        break;
+    default:
+        g_assert_not_reached();
     }
 }
 
-- 
2.25.1

GDB single-stepping is now handled generically.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/openrisc/translate.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/target/openrisc/translate.c b/target/openrisc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/openrisc/translate.c
+++ b/target/openrisc/translate.c
@@ -XXX,XX +XXX,XX @@ static void openrisc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
             /* The jump destination is indirect/computed; use jmp_pc.  */
             tcg_gen_mov_tl(cpu_pc, jmp_pc);
             tcg_gen_discard_tl(jmp_pc);
-            if (unlikely(dc->base.singlestep_enabled)) {
-                gen_exception(dc, EXCP_DEBUG);
-            } else {
-                tcg_gen_lookup_and_goto_ptr();
-            }
+            tcg_gen_lookup_and_goto_ptr();
             break;
         }
         /* The jump destination is direct; use jmp_pc_imm.
@@ -XXX,XX +XXX,XX @@ static void openrisc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
             break;
         }
         tcg_gen_movi_tl(cpu_pc, jmp_dest);
-        if (unlikely(dc->base.singlestep_enabled)) {
-            gen_exception(dc, EXCP_DEBUG);
-        } else {
-            tcg_gen_lookup_and_goto_ptr();
-        }
+        tcg_gen_lookup_and_goto_ptr();
         break;
 
     case DISAS_EXIT:
-        if (unlikely(dc->base.singlestep_enabled)) {
-            gen_exception(dc, EXCP_DEBUG);
-        } else {
-            tcg_gen_exit_tb(NULL, 0);
-        }
+        tcg_gen_exit_tb(NULL, 0);
         break;
     default:
         g_assert_not_reached();
-- 
2.25.1

GDB single-stepping is now handled generically.
Reuse gen_debug_exception to handle architectural debug exceptions.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/translate.c | 38 ++++++++------------------------------
 1 file changed, 8 insertions(+), 30 deletions(-)

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -XXX,XX +XXX,XX @@
 
 #define CPU_SINGLE_STEP 0x1
 #define CPU_BRANCH_STEP 0x2
-#define GDBSTUB_SINGLE_STEP 0x4
 
 /* Include definitions for instructions classes and implementations flags */
 /* #define PPC_DEBUG_DISAS */
@@ -XXX,XX +XXX,XX @@ static uint32_t gen_prep_dbgex(DisasContext *ctx)
 
 static void gen_debug_exception(DisasContext *ctx)
 {
-    gen_helper_raise_exception(cpu_env, tcg_constant_i32(EXCP_DEBUG));
+    gen_helper_raise_exception(cpu_env, tcg_constant_i32(gen_prep_dbgex(ctx)));
     ctx->base.is_jmp = DISAS_NORETURN;
 }
 
@@ -XXX,XX +XXX,XX @@ static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
 
 static void gen_lookup_and_goto_ptr(DisasContext *ctx)
 {
-    int sse = ctx->singlestep_enabled;
-    if (unlikely(sse)) {
-        if (sse & GDBSTUB_SINGLE_STEP) {
-            gen_debug_exception(ctx);
-        } else if (sse & (CPU_SINGLE_STEP | CPU_BRANCH_STEP)) {
-            gen_helper_raise_exception(cpu_env, tcg_constant_i32(gen_prep_dbgex(ctx)));
-        } else {
-            tcg_gen_exit_tb(NULL, 0);
-        }
+    if (unlikely(ctx->singlestep_enabled)) {
+        gen_debug_exception(ctx);
     } else {
         tcg_gen_lookup_and_goto_ptr();
     }
@@ -XXX,XX +XXX,XX @@ static void ppc_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
     ctx->singlestep_enabled = 0;
     if ((hflags >> HFLAGS_SE) & 1) {
         ctx->singlestep_enabled |= CPU_SINGLE_STEP;
+        ctx->base.max_insns = 1;
     }
     if ((hflags >> HFLAGS_BE) & 1) {
         ctx->singlestep_enabled |= CPU_BRANCH_STEP;
     }
-    if (unlikely(ctx->base.singlestep_enabled)) {
-        ctx->singlestep_enabled |= GDBSTUB_SINGLE_STEP;
-    }
-
-    if (ctx->singlestep_enabled & (CPU_SINGLE_STEP | GDBSTUB_SINGLE_STEP)) {
-        ctx->base.max_insns = 1;
-    }
 }
 
 static void ppc_tr_tb_start(DisasContextBase *db, CPUState *cs)
@@ -XXX,XX +XXX,XX @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
     DisasContext *ctx = container_of(dcbase, DisasContext, base);
     DisasJumpType is_jmp = ctx->base.is_jmp;
     target_ulong nip = ctx->base.pc_next;
-    int sse;
 
     if (is_jmp == DISAS_NORETURN) {
         /* We have already exited the TB. */
@@ -XXX,XX +XXX,XX @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
     }
 
     /* Honor single stepping. */
-    sse = ctx->singlestep_enabled & (CPU_SINGLE_STEP | GDBSTUB_SINGLE_STEP);
-    if (unlikely(sse)) {
+    if (unlikely(ctx->singlestep_enabled & CPU_SINGLE_STEP)
+        && (nip <= 0x100 || nip > 0xf00)) {
         switch (is_jmp) {
         case DISAS_TOO_MANY:
         case DISAS_EXIT_UPDATE:
@@ -XXX,XX +XXX,XX @@ static void ppc_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
             g_assert_not_reached();
         }
 
-        if (sse & GDBSTUB_SINGLE_STEP) {
-            gen_debug_exception(ctx);
-            return;
-        }
-        /* else CPU_SINGLE_STEP... */
-        if (nip <= 0x100 || nip > 0xf00) {
-            gen_helper_raise_exception(cpu_env, tcg_constant_i32(gen_prep_dbgex(ctx)));
-            return;
-        }
+        gen_debug_exception(ctx);
+        return;
     }
 
     switch (is_jmp) {
-- 
2.25.1

We have already set DISAS_NORETURN in generate_exception,
which makes the exit_tb unreachable.

Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/riscv/insn_trans/trans_privileged.c.inc | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/target/riscv/insn_trans/trans_privileged.c.inc b/target/riscv/insn_trans/trans_privileged.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/insn_trans/trans_privileged.c.inc
+++ b/target/riscv/insn_trans/trans_privileged.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_ecall(DisasContext *ctx, arg_ecall *a)
 {
     /* always generates U-level ECALL, fixed in do_interrupt handler */
     generate_exception(ctx, RISCV_EXCP_U_ECALL);
-    exit_tb(ctx); /* no chaining */
-    ctx->base.is_jmp = DISAS_NORETURN;
     return true;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool trans_ebreak(DisasContext *ctx, arg_ebreak *a)
         post   = opcode_at(&ctx->base, post_addr);
     }
 
-    if  (pre == 0x01f01013 && ebreak == 0x00100073 && post == 0x40705013) {
+    if (pre == 0x01f01013 && ebreak == 0x00100073 && post == 0x40705013) {
         generate_exception(ctx, RISCV_EXCP_SEMIHOST);
     } else {
         generate_exception(ctx, RISCV_EXCP_BREAKPOINT);
     }
-    exit_tb(ctx); /* no chaining */
-    ctx->base.is_jmp = DISAS_NORETURN;
     return true;
 }
 
-- 
2.25.1

GDB single-stepping is now handled generically, which means
we don't need to do anything in the wrappers.

Reviewed-by: Alistair Francis <alistair.francis@wdc.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/riscv/translate.c                      | 27 +------------------
 .../riscv/insn_trans/trans_privileged.c.inc   |  4 +--
 target/riscv/insn_trans/trans_rvi.c.inc       |  8 +++---
 target/riscv/insn_trans/trans_rvv.c.inc       |  2 +-
 4 files changed, 7 insertions(+), 34 deletions(-)

diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -XXX,XX +XXX,XX @@ static void generate_exception_mtval(DisasContext *ctx, int excp)
     ctx->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_exception_debug(void)
-{
-    gen_helper_raise_exception(cpu_env, tcg_constant_i32(EXCP_DEBUG));
-}
-
-/* Wrapper around tcg_gen_exit_tb that handles single stepping */
-static void exit_tb(DisasContext *ctx)
-{
-    if (ctx->base.singlestep_enabled) {
-        gen_exception_debug();
-    } else {
-        tcg_gen_exit_tb(NULL, 0);
-    }
-}
-
-/* Wrapper around tcg_gen_lookup_and_goto_ptr that handles single stepping */
-static void lookup_and_goto_ptr(DisasContext *ctx)
-{
-    if (ctx->base.singlestep_enabled) {
-        gen_exception_debug();
-    } else {
-        tcg_gen_lookup_and_goto_ptr();
-    }
-}
-
 static void gen_exception_illegal(DisasContext *ctx)
 {
     generate_exception(ctx, RISCV_EXCP_ILLEGAL_INST);
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
         tcg_gen_exit_tb(ctx->base.tb, n);
     } else {
         tcg_gen_movi_tl(cpu_pc, dest);
-        lookup_and_goto_ptr(ctx);
+        tcg_gen_lookup_and_goto_ptr();
     }
 }
 
diff --git a/target/riscv/insn_trans/trans_privileged.c.inc b/target/riscv/insn_trans/trans_privileged.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/insn_trans/trans_privileged.c.inc
+++ b/target/riscv/insn_trans/trans_privileged.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_sret(DisasContext *ctx, arg_sret *a)
 
     if (has_ext(ctx, RVS)) {
         gen_helper_sret(cpu_pc, cpu_env, cpu_pc);
-        exit_tb(ctx); /* no chaining */
+        tcg_gen_exit_tb(NULL, 0); /* no chaining */
         ctx->base.is_jmp = DISAS_NORETURN;
     } else {
         return false;
@@ -XXX,XX +XXX,XX @@ static bool trans_mret(DisasContext *ctx, arg_mret *a)
 #ifndef CONFIG_USER_ONLY
     tcg_gen_movi_tl(cpu_pc, ctx->base.pc_next);
     gen_helper_mret(cpu_pc, cpu_env, cpu_pc);
-    exit_tb(ctx); /* no chaining */
+    tcg_gen_exit_tb(NULL, 0); /* no chaining */
     ctx->base.is_jmp = DISAS_NORETURN;
     return true;
 #else
diff --git a/target/riscv/insn_trans/trans_rvi.c.inc b/target/riscv/insn_trans/trans_rvi.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/insn_trans/trans_rvi.c.inc
+++ b/target/riscv/insn_trans/trans_rvi.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_jalr(DisasContext *ctx, arg_jalr *a)
     if (a->rd != 0) {
         tcg_gen_movi_tl(cpu_gpr[a->rd], ctx->pc_succ_insn);
     }
-
-    /* No chaining with JALR. */
-    lookup_and_goto_ptr(ctx);
+    tcg_gen_lookup_and_goto_ptr();
 
     if (misaligned) {
         gen_set_label(misaligned);
@@ -XXX,XX +XXX,XX @@ static bool trans_fence_i(DisasContext *ctx, arg_fence_i *a)
      * however we need to end the translation block
      */
     tcg_gen_movi_tl(cpu_pc, ctx->pc_succ_insn);
-    exit_tb(ctx);
+    tcg_gen_exit_tb(NULL, 0);
     ctx->base.is_jmp = DISAS_NORETURN;
     return true;
 }
@@ -XXX,XX +XXX,XX @@ static bool do_csr_post(DisasContext *ctx)
 {
     /* We may have changed important cpu state -- exit to main loop. */
     tcg_gen_movi_tl(cpu_pc, ctx->pc_succ_insn);
-    exit_tb(ctx);
+    tcg_gen_exit_tb(NULL, 0);
     ctx->base.is_jmp = DISAS_NORETURN;
     return true;
 }
diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index XXXXXXX..XXXXXXX 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -XXX,XX +XXX,XX @@ static bool trans_vsetvl(DisasContext *ctx, arg_vsetvl *a)
     gen_set_gpr(ctx, a->rd, dst);
 
     tcg_gen_movi_tl(cpu_pc, ctx->pc_succ_insn);
-    lookup_and_goto_ptr(ctx);
+    tcg_gen_lookup_and_goto_ptr();
     ctx->base.is_jmp = DISAS_NORETURN;
     return true;
 }
-- 
2.25.1

GDB single-stepping is now handled generically.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/rx/helper.h    |  1 -
 target/rx/op_helper.c |  8 --------
 target/rx/translate.c | 12 ++----------
 3 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/target/rx/helper.h b/target/rx/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/helper.h
+++ b/target/rx/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_1(raise_illegal_instruction, noreturn, env)
 DEF_HELPER_1(raise_access_fault, noreturn, env)
 DEF_HELPER_1(raise_privilege_violation, noreturn, env)
 DEF_HELPER_1(wait, noreturn, env)
-DEF_HELPER_1(debug, noreturn, env)
 DEF_HELPER_2(rxint, noreturn, env, i32)
 DEF_HELPER_1(rxbrk, noreturn, env)
 DEF_HELPER_FLAGS_3(fadd, TCG_CALL_NO_WG, f32, env, f32, f32)
diff --git a/target/rx/op_helper.c b/target/rx/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/op_helper.c
+++ b/target/rx/op_helper.c
@@ -XXX,XX +XXX,XX @@ void QEMU_NORETURN helper_wait(CPURXState *env)
     raise_exception(env, EXCP_HLT, 0);
 }
 
-void QEMU_NORETURN helper_debug(CPURXState *env)
-{
-    CPUState *cs = env_cpu(env);
-
-    cs->exception_index = EXCP_DEBUG;
-    cpu_loop_exit(cs);
-}
-
 void QEMU_NORETURN helper_rxint(CPURXState *env, uint32_t vec)
 {
     raise_exception(env, 0x100 + vec, 0);
diff --git a/target/rx/translate.c b/target/rx/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/rx/translate.c
+++ b/target/rx/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *dc, int n, target_ulong dest)
         tcg_gen_exit_tb(dc->base.tb, n);
     } else {
         tcg_gen_movi_i32(cpu_pc, dest);
-        if (dc->base.singlestep_enabled) {
-            gen_helper_debug(cpu_env);
-        } else {
-            tcg_gen_lookup_and_goto_ptr();
-        }
+        tcg_gen_lookup_and_goto_ptr();
     }
     dc->base.is_jmp = DISAS_NORETURN;
 }
@@ -XXX,XX +XXX,XX @@ static void rx_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
         gen_goto_tb(ctx, 0, dcbase->pc_next);
         break;
     case DISAS_JUMP:
-        if (ctx->base.singlestep_enabled) {
-            gen_helper_debug(cpu_env);
-        } else {
-            tcg_gen_lookup_and_goto_ptr();
-        }
+        tcg_gen_lookup_and_goto_ptr();
         break;
     case DISAS_UPDATE:
         tcg_gen_movi_i32(cpu_pc, ctx->base.pc_next);
-- 
2.25.1

GDB single-stepping is now handled generically.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/translate.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -XXX,XX +XXX,XX @@ struct DisasContext {
     uint64_t pc_tmp;
     uint32_t ilen;
     enum cc_op cc_op;
-    bool do_debug;
 };
 
 /* Information carried about a condition to be evaluated.  */
@@ -XXX,XX +XXX,XX @@ static void s390x_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
 
     dc->cc_op = CC_OP_DYNAMIC;
     dc->ex_value = dc->base.tb->cs_base;
-    dc->do_debug = dc->base.singlestep_enabled;
 }
 
 static void s390x_tr_tb_start(DisasContextBase *db, CPUState *cs)
@@ -XXX,XX +XXX,XX @@ static void s390x_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
         /* FALLTHRU */
     case DISAS_PC_CC_UPDATED:
         /* Exit the TB, either by raising a debug exception or by return.  */
-        if (dc->do_debug) {
-            gen_exception(EXCP_DEBUG);
-        } else if ((dc->base.tb->flags & FLAG_MASK_PER) ||
-                   dc->base.is_jmp == DISAS_PC_STALE_NOCHAIN) {
+        if ((dc->base.tb->flags & FLAG_MASK_PER) ||
+             dc->base.is_jmp == DISAS_PC_STALE_NOCHAIN) {
             tcg_gen_exit_tb(NULL, 0);
         } else {
             tcg_gen_lookup_and_goto_ptr();
-- 
2.25.1

GDB single-stepping is now handled generically.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/sh4/helper.h    |  1 -
 target/sh4/op_helper.c |  5 -----
 target/sh4/translate.c | 14 +++-----------
 3 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/target/sh4/helper.h b/target/sh4/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/helper.h
+++ b/target/sh4/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_1(raise_illegal_instruction, noreturn, env)
 DEF_HELPER_1(raise_slot_illegal_instruction, noreturn, env)
 DEF_HELPER_1(raise_fpu_disable, noreturn, env)
 DEF_HELPER_1(raise_slot_fpu_disable, noreturn, env)
-DEF_HELPER_1(debug, noreturn, env)
 DEF_HELPER_1(sleep, noreturn, env)
 DEF_HELPER_2(trapa, noreturn, env, i32)
 DEF_HELPER_1(exclusive, noreturn, env)
diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/op_helper.c
+++ b/target/sh4/op_helper.c
@@ -XXX,XX +XXX,XX @@ void helper_raise_slot_fpu_disable(CPUSH4State *env)
     raise_exception(env, 0x820, 0);
 }
 
-void helper_debug(CPUSH4State *env)
-{
-    raise_exception(env, EXCP_DEBUG, 0);
-}
-
 void helper_sleep(CPUSH4State *env)
 {
     CPUState *cs = env_cpu(env);
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
         tcg_gen_exit_tb(ctx->base.tb, n);
     } else {
         tcg_gen_movi_i32(cpu_pc, dest);
-        if (ctx->base.singlestep_enabled) {
-            gen_helper_debug(cpu_env);
-        } else if (use_exit_tb(ctx)) {
+        if (use_exit_tb(ctx)) {
             tcg_gen_exit_tb(NULL, 0);
         } else {
             tcg_gen_lookup_and_goto_ptr();
@@ -XXX,XX +XXX,XX @@ static void gen_jump(DisasContext * ctx)
 	   delayed jump as immediate jump are conditinal jumps */
 	tcg_gen_mov_i32(cpu_pc, cpu_delayed_pc);
         tcg_gen_discard_i32(cpu_delayed_pc);
-        if (ctx->base.singlestep_enabled) {
-            gen_helper_debug(cpu_env);
-        } else if (use_exit_tb(ctx)) {
+        if (use_exit_tb(ctx)) {
             tcg_gen_exit_tb(NULL, 0);
         } else {
             tcg_gen_lookup_and_goto_ptr();
@@ -XXX,XX +XXX,XX @@ static void sh4_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
     switch (ctx->base.is_jmp) {
     case DISAS_STOP:
         gen_save_cpu_state(ctx, true);
-        if (ctx->base.singlestep_enabled) {
-            gen_helper_debug(cpu_env);
-        } else {
-            tcg_gen_exit_tb(NULL, 0);
-        }
+        tcg_gen_exit_tb(NULL, 0);
         break;
     case DISAS_NEXT:
     case DISAS_TOO_MANY:
-- 
2.25.1

GDB single-stepping is now handled generically.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/tricore/helper.h    |  1 -
 target/tricore/op_helper.c |  7 -------
 target/tricore/translate.c | 14 +-------------
 3 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/target/tricore/helper.h b/target/tricore/helper.h
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/helper.h
+++ b/target/tricore/helper.h
@@ -XXX,XX +XXX,XX @@ DEF_HELPER_2(psw_write, void, env, i32)
 DEF_HELPER_1(psw_read, i32, env)
 /* Exceptions */
 DEF_HELPER_3(raise_exception_sync, noreturn, env, i32, i32)
-DEF_HELPER_2(qemu_excp, noreturn, env, i32)
diff --git a/target/tricore/op_helper.c b/target/tricore/op_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/op_helper.c
+++ b/target/tricore/op_helper.c
@@ -XXX,XX +XXX,XX @@ static void raise_exception_sync_helper(CPUTriCoreState *env, uint32_t class,
     raise_exception_sync_internal(env, class, tin, pc, 0);
 }
 
-void helper_qemu_excp(CPUTriCoreState *env, uint32_t excp)
-{
-    CPUState *cs = env_cpu(env);
-    cs->exception_index = excp;
-    cpu_loop_exit(cs);
-}
-
 /* Addressing mode helper */
 
 static uint16_t reverse16(uint16_t val)
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/tricore/translate.c
+++ b/target/tricore/translate.c
@@ -XXX,XX +XXX,XX @@ static inline void gen_save_pc(target_ulong pc)
     tcg_gen_movi_tl(cpu_PC, pc);
 }
 
-static void generate_qemu_excp(DisasContext *ctx, int excp)
-{
-    TCGv_i32 tmp = tcg_const_i32(excp);
-    gen_helper_qemu_excp(cpu_env, tmp);
-    ctx->base.is_jmp = DISAS_NORETURN;
-    tcg_temp_free(tmp);
-}
-
 static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
 {
     if (translator_use_goto_tb(&ctx->base, dest)) {
@@ -XXX,XX +XXX,XX @@ static void gen_goto_tb(DisasContext *ctx, int n, target_ulong dest)
         tcg_gen_exit_tb(ctx->base.tb, n);
     } else {
         gen_save_pc(dest);
-        if (ctx->base.singlestep_enabled) {
-            generate_qemu_excp(ctx, EXCP_DEBUG);
-        } else {
-            tcg_gen_lookup_and_goto_ptr();
-        }
+        tcg_gen_lookup_and_goto_ptr();
     }
 }
 
-- 
2.25.1

GDB single-stepping is now handled generically.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/xtensa/translate.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index XXXXXXX..XXXXXXX 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -XXX,XX +XXX,XX @@ static void gen_jump_slot(DisasContext *dc, TCGv dest, int slot)
     if (dc->icount) {
         tcg_gen_mov_i32(cpu_SR[ICOUNT], dc->next_icount);
     }
-    if (dc->base.singlestep_enabled) {
-        gen_exception(dc, EXCP_DEBUG);
+    if (dc->op_flags & XTENSA_OP_POSTPROCESS) {
+        slot = gen_postprocess(dc, slot);
+    }
+    if (slot >= 0) {
+        tcg_gen_goto_tb(slot);
+        tcg_gen_exit_tb(dc->base.tb, slot);
     } else {
-        if (dc->op_flags & XTENSA_OP_POSTPROCESS) {
-            slot = gen_postprocess(dc, slot);
-        }
-        if (slot >= 0) {
-            tcg_gen_goto_tb(slot);
-            tcg_gen_exit_tb(dc->base.tb, slot);
-        } else {
-            tcg_gen_exit_tb(NULL, 0);
-        }
+        tcg_gen_exit_tb(NULL, 0);
     }
     dc->base.is_jmp = DISAS_NORETURN;
 }
@@ -XXX,XX +XXX,XX @@ static void xtensa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
     case DISAS_NORETURN:
         break;
     case DISAS_TOO_MANY:
-        if (dc->base.singlestep_enabled) {
-            tcg_gen_movi_i32(cpu_pc, dc->pc);
-            gen_exception(dc, EXCP_DEBUG);
-        } else {
-            gen_jumpi(dc, dc->pc, 0);
-        }
+        gen_jumpi(dc, dc->pc, 0);
         break;
     default:
         g_assert_not_reached();
-- 
2.25.1

This reverts commit 1b36e4f5a5de585210ea95f2257839c2312be28f.

Despite a comment saying why cpu_common_props cannot be placed in
a file that is compiled once, it was moved anyway.  Revert that.

Since then, Property is not defined in hw/core/cpu.h, so it is now
easier to declare a function to install the properties rather than
the Property array itself.

Cc: Eduardo Habkost <ehabkost@redhat.com>
Suggested-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h |  1 +
 cpu.c                 | 21 +++++++++++++++++++++
 hw/core/cpu-common.c  | 17 +----------------
 3 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ void QEMU_NORETURN cpu_abort(CPUState *cpu, const char *fmt, ...)
     GCC_FMT_ATTR(2, 3);
 
 /* $(top_srcdir)/cpu.c */
+void cpu_class_init_props(DeviceClass *dc);
 void cpu_exec_initfn(CPUState *cpu);
 void cpu_exec_realizefn(CPUState *cpu, Error **errp);
 void cpu_exec_unrealizefn(CPUState *cpu);
diff --git a/cpu.c b/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/cpu.c
+++ b/cpu.c
@@ -XXX,XX +XXX,XX @@ void cpu_exec_unrealizefn(CPUState *cpu)
     cpu_list_remove(cpu);
 }
 
+static Property cpu_common_props[] = {
+#ifndef CONFIG_USER_ONLY
+    /*
+     * Create a memory property for softmmu CPU object,
+     * so users can wire up its memory. (This can't go in hw/core/cpu.c
+     * because that file is compiled only once for both user-mode
+     * and system builds.) The default if no link is set up is to use
+     * the system address space.
+     */
+    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
+                     MemoryRegion *),
+#endif
+    DEFINE_PROP_BOOL("start-powered-off", CPUState, start_powered_off, false),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+void cpu_class_init_props(DeviceClass *dc)
+{
+    device_class_set_props(dc, cpu_common_props);
+}
+
 void cpu_exec_initfn(CPUState *cpu)
 {
     cpu->as = NULL;
diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/core/cpu-common.c
+++ b/hw/core/cpu-common.c
@@ -XXX,XX +XXX,XX @@ static int64_t cpu_common_get_arch_id(CPUState *cpu)
     return cpu->cpu_index;
 }
 
-static Property cpu_common_props[] = {
-#ifndef CONFIG_USER_ONLY
-    /* Create a memory property for softmmu CPU object,
-     * so users can wire up its memory. (This can't go in hw/core/cpu.c
-     * because that file is compiled only once for both user-mode
-     * and system builds.) The default if no link is set up is to use
-     * the system address space.
-     */
-    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
-                     MemoryRegion *),
-#endif
-    DEFINE_PROP_BOOL("start-powered-off", CPUState, start_powered_off, false),
-    DEFINE_PROP_END_OF_LIST(),
-};
-
 static void cpu_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -XXX,XX +XXX,XX @@ static void cpu_class_init(ObjectClass *klass, void *data)
     dc->realize = cpu_common_realizefn;
     dc->unrealize = cpu_common_unrealizefn;
     dc->reset = cpu_common_reset;
-    device_class_set_props(dc, cpu_common_props);
+    cpu_class_init_props(dc);
     /*
      * Reason: CPUs still need special care by board code: wiring up
      * IRQs, adding reset handlers, halting non-first CPUs, ...
-- 
2.25.1

The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:

Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027

for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:

tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)

----------------------------------------------------------------
Improvements to qemu/int128
Fixes for 128/64 division.
Cleanup tcg/optimize.c
Optimize redundant sign extensions

----------------------------------------------------------------
Frédéric Pétrot (1):
      qemu/int128: Add int128_{not,xor}

Luis Pires (4):
      host-utils: move checks out of divu128/divs128
      host-utils: move udiv_qrnnd() to host-utils
      host-utils: add 128-bit quotient support to divu128/divs128
      host-utils: add unit tests for divu128/divs128

Richard Henderson (51):
      tcg/optimize: Rename "mask" to "z_mask"
      tcg/optimize: Split out OptContext
      tcg/optimize: Remove do_default label
      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
      tcg/optimize: Move prev_mb into OptContext
      tcg/optimize: Split out init_arguments
      tcg/optimize: Split out copy_propagate
      tcg/optimize: Split out fold_call
      tcg/optimize: Drop nb_oargs, nb_iargs locals
      tcg/optimize: Change fail return for do_constant_folding_cond*
      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
      tcg/optimize: Split out finish_folding
      tcg/optimize: Use a boolean to avoid a mass of continues
      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
      tcg/optimize: Split out fold_const{1,2}
      tcg/optimize: Split out fold_setcond2
      tcg/optimize: Split out fold_brcond2
      tcg/optimize: Split out fold_brcond
      tcg/optimize: Split out fold_setcond
      tcg/optimize: Split out fold_mulu2_i32
      tcg/optimize: Split out fold_addsub2_i32
      tcg/optimize: Split out fold_movcond
      tcg/optimize: Split out fold_extract2
      tcg/optimize: Split out fold_extract, fold_sextract
      tcg/optimize: Split out fold_deposit
      tcg/optimize: Split out fold_count_zeros
      tcg/optimize: Split out fold_bswap
      tcg/optimize: Split out fold_dup, fold_dup2
      tcg/optimize: Split out fold_mov
      tcg/optimize: Split out fold_xx_to_i
      tcg/optimize: Split out fold_xx_to_x
      tcg/optimize: Split out fold_xi_to_i
      tcg/optimize: Add type to OptContext
      tcg/optimize: Split out fold_to_not
      tcg/optimize: Split out fold_sub_to_neg
      tcg/optimize: Split out fold_xi_to_x
      tcg/optimize: Split out fold_ix_to_i
      tcg/optimize: Split out fold_masks
      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
      tcg/optimize: Sink commutative operand swapping into fold functions
      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
      tcg/optimize: Use fold_xx_to_i for orc
      tcg/optimize: Use fold_xi_to_x for mul
      tcg/optimize: Use fold_xi_to_x for div
      tcg/optimize: Use fold_xx_to_i for rem
      tcg/optimize: Optimize sign extensions
      tcg/optimize: Propagate sign info for logical operations
      tcg/optimize: Propagate sign info for setcond
      tcg/optimize: Propagate sign info for bit counting
      tcg/optimize: Propagate sign info for shifting

From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>

Addition of not and xor on 128-bit integers.

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
[rth: Split out logical operations.]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/int128.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return a;
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return ~a;
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return a | b;
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return a ^ b;
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return int128_make128(a, (a < 0) ? -1 : 0);
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return int128_make128(~a.lo, ~a.hi);
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return int128_make128(a.lo | b.lo, a.hi | b.hi);
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     int64_t h;
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

In preparation for changing the divu128/divs128 implementations
to allow for quotients larger than 64 bits, move the div-by-zero
and overflow checks to the callers.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |  5 +++--
 include/qemu/host-utils.h | 34 ++++++++++++---------------------
 target/ppc/int_helper.c   | 14 +++++++++-----
 util/host-utils.c         | 40 ++++++++++++++++++---------------------
 4 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
         return 0;
     }
     /*
-     * Ignore divu128() return value as we've caught div-by-zero and don't
-     * need different behaviour for overflow.
+     * BUG: when CONFIG_INT128 is not defined, the current implementation of
+     * divu128 does not return a valid truncated quotient, so the result will
+     * be wrong.
      */
     divu128(&lo, &hi, clk->period);
     return lo;
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
-        __uint128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result > UINT64_MAX;
-    }
+    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
+    __uint128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 
-static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
-        __int128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result != *plow;
-    }
+    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
     uint64_t rt = 0;
     int overflow = 0;
 
-    overflow = divu128(&rt, &ra, rb);
-
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || ra >= rb)) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divu128(&rt, &ra, rb);
     }
 
     if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
     int64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
-    int overflow = divs128(&rt, &ra, rb);
+    int overflow = 0;
 
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divs128(&rt, &ra, rb);
     }
 
     if (oe) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
     *phigh = rh;
 }
 
-/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
-/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
-/* remainder via phigh. */
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+/*
+ * Unsigned 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
     unsigned i;
     uint64_t carry = 0;
 
-    if (divisor == 0) {
-        return 1;
-    } else if (dhi == 0) {
+    if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
         *phigh = dlo % divisor;
-        return 0;
-    } else if (dhi >= divisor) {
-        return 1;
     } else {
 
         for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 
         *plow = dlo;
         *phigh = dhi;
-        return 0;
     }
 }
 
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+/*
+ * Signed 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
     int sgn_dvdnd = *phigh < 0;
     int sgn_divsr = divisor < 0;
-    int overflow = 0;
 
     if (sgn_dvdnd) {
         *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
         divisor = 0 - divisor;
     }
 
-    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
+    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 
     if (sgn_dvdnd  ^ sgn_divsr) {
         *plow = 0 - *plow;
     }
-
-    if (!overflow) {
-        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
-            overflow = 1;
-        }
-    }
-
-    return overflow;
 }
 #endif
 
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
so it can be reused by divu128().

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-macros.h | 82 ----------------------------------
 include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
  * so some portions are provided under:
  *  the SoftFloat-2a license
  *  the BSD license
- *  GPL-v2-or-later
  *
  * Any future contributions to this file after December 1st 2014 will be
  * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* Portions of this work are licensed under the terms of the GNU GPL,
- * version 2 or later. See the COPYING file in the top-level directory.
- */
-
 #ifndef FPU_SOFTFLOAT_MACROS_H
 #define FPU_SOFTFLOAT_MACROS_H
 
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 
 }
 
-/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
- * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
- *
- * Licensed under the GPLv2/LGPLv3
- */
-static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
-                                  uint64_t n0, uint64_t d)
-{
-#if defined(__x86_64__)
-    uint64_t q;
-    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
-    return q;
-#elif defined(__s390x__) && !defined(__clang__)
-    /* Need to use a TImode type to get an even register pair for DLGR.  */
-    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
-    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
-    *r = n >> 64;
-    return n;
-#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
-    /* From Power ISA 2.06, programming note for divdeu.  */
-    uint64_t q1, q2, Q, r1, r2, R;
-    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
-        : "=&r"(q1), "=r"(q2)
-        : "r"(n1), "r"(n0), "r"(d));
-    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
-    r2 = n0 - (q2 * d);
-    Q = q1 + q2;
-    R = r1 + r2;
-    if (R >= d || R < r2) { /* overflow implies R > d */
-        Q += 1;
-        R -= d;
-    }
-    *r = R;
-    return Q;
-#else
-    uint64_t d0, d1, q0, q1, r1, r0, m;
-
-    d0 = (uint32_t)d;
-    d1 = d >> 32;
-
-    r1 = n1 % d1;
-    q1 = n1 / d1;
-    m = q1 * d0;
-    r1 = (r1 << 32) | (n0 >> 32);
-    if (r1 < m) {
-        q1 -= 1;
-        r1 += d;
-        if (r1 >= d) {
-            if (r1 < m) {
-                q1 -= 1;
-                r1 += d;
-            }
-        }
-    }
-    r1 -= m;
-
-    r0 = r1 % d1;
-    q0 = r1 / d1;
-    m = q0 * d0;
-    r0 = (r0 << 32) | (uint32_t)n0;
-    if (r0 < m) {
-        q0 -= 1;
-        r0 += d;
-        if (r0 >= d) {
-            if (r0 < m) {
-                q0 -= 1;
-                r0 += d;
-            }
-        }
-    }
-    r0 -= m;
-
-    *r = r0;
-    return (q1 << 32) | q0;
-#endif
-}
-
 /*----------------------------------------------------------------------------
 | Returns an approximation to the square root of the 32-bit significand given
 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
+
 #ifndef HOST_UTILS_H
 #define HOST_UTILS_H
 
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
  */
 void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 
+/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
+                                  uint64_t n0, uint64_t d)
+{
+#if defined(__x86_64__)
+    uint64_t q;
+    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
+    return q;
+#elif defined(__s390x__) && !defined(__clang__)
+    /* Need to use a TImode type to get an even register pair for DLGR.  */
+    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
+    *r = n >> 64;
+    return n;
+#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
+    /* From Power ISA 2.06, programming note for divdeu.  */
+    uint64_t q1, q2, Q, r1, r2, R;
+    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
+        : "=&r"(q1), "=r"(q2)
+        : "r"(n1), "r"(n0), "r"(d));
+    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
+    r2 = n0 - (q2 * d);
+    Q = q1 + q2;
+    R = r1 + r2;
+    if (R >= d || R < r2) { /* overflow implies R > d */
+        Q += 1;
+        R -= d;
+    }
+    *r = R;
+    return Q;
+#else
+    uint64_t d0, d1, q0, q1, r1, r0, m;
+
+    d0 = (uint32_t)d;
+    d1 = d >> 32;
+
+    r1 = n1 % d1;
+    q1 = n1 / d1;
+    m = q1 * d0;
+    r1 = (r1 << 32) | (n0 >> 32);
+    if (r1 < m) {
+        q1 -= 1;
+        r1 += d;
+        if (r1 >= d) {
+            if (r1 < m) {
+                q1 -= 1;
+                r1 += d;
+            }
+        }
+    }
+    r1 -= m;
+
+    r0 = r1 % d1;
+    q0 = r1 / d1;
+    m = q0 * d0;
+    r0 = (r0 << 32) | (uint32_t)n0;
+    if (r0 < m) {
+        q0 -= 1;
+        r0 += d;
+        if (r0 >= d) {
+            if (r0 < m) {
+                q0 -= 1;
+                r0 += d;
+            }
+        }
+    }
+    r0 -= m;
+
+    *r = r0;
+    return (q1 << 32) | q0;
+#endif
+}
+
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

These will be used to implement new decimal floating point
instructions from Power ISA 3.1.

The remainder is now returned directly by divu128/divs128,
freeing up phigh to receive the high 64 bits of the quotient.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |   6 +-
 include/qemu/host-utils.h |  20 ++++--
 target/ppc/int_helper.c   |   9 +--
 util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 4 files changed, 108 insertions(+), 60 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
     if (clk->period == 0) {
         return 0;
     }
-    /*
-     * BUG: when CONFIG_INT128 is not defined, the current implementation of
-     * divu128 does not return a valid truncated quotient, so the result will
-     * be wrong.
-     */
+
     divu128(&lo, &hi, clk->period);
     return lo;
 }
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
+                               uint64_t divisor)
 {
     __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
     __uint128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 
-static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
+                              int64_t divisor)
 {
-    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
     __int128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
 
 uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
 {
-    int64_t rt = 0;
+    uint64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
     int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
     int cr;
     uint64_t lo_value;
     uint64_t hi_value;
+    uint64_t rem;
     ppc_avr_t ret = { .u64 = { 0, 0 } };
 
     if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
          * In that case, we leave r unchanged.
          */
     } else {
-        divu128(&lo_value, &hi_value, 1000000000000000ULL);
+        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 
-        for (i = 1; i < 16; hi_value /= 10, i++) {
-            bcd_put_digit(&ret, hi_value % 10, i);
+        for (i = 1; i < 16; rem /= 10, i++) {
+            bcd_put_digit(&ret, rem % 10, i);
         }
 
         for (; i < 32; lo_value /= 10, i++) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
 }
 
 /*
- * Unsigned 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Unsigned 128-by-64 division.
+ * Returns the remainder.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
-    unsigned i;
-    uint64_t carry = 0;
+    uint64_t rem, dhighest;
+    int sh;
 
     if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
-        *phigh = dlo % divisor;
+        *phigh = 0;
+        return dlo % divisor;
     } else {
+        sh = clz64(divisor);
 
-        for (i = 0; i < 64; i++) {
-            carry = dhi >> 63;
-            dhi = (dhi << 1) | (dlo >> 63);
-            if (carry || (dhi >= divisor)) {
-                dhi -= divisor;
-                carry = 1;
-            } else {
-                carry = 0;
+        if (dhi < divisor) {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
             }
-            dlo = (dlo << 1) | carry;
+
+            *phigh = 0;
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+        } else {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhighest = dhi >> (64 - sh);
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
+
+                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+            } else {
+                /**
+                 * dhi >= divisor
+                 * Since the MSB of divisor is set (sh == 0),
+                 * (dhi - divisor) < divisor
+                 *
+                 * Thus, the high part of the quotient is 1, and we can
+                 * calculate the low part with a single call to udiv_qrnnd
+                 * after subtracting divisor from dhi
+                 */
+                dhi -= divisor;
+                *phigh = 1;
+            }
+
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
         }
 
-        *plow = dlo;
-        *phigh = dhi;
+        /*
+         * since the dividend/divisor might have been normalized,
+         * the remainder might also have to be shifted back
+         */
+        return rem >> sh;
     }
 }
 
 /*
- * Signed 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Signed 128-by-64 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    int sgn_dvdnd = *phigh < 0;
-    int sgn_divsr = divisor < 0;
+    bool neg_quotient = false, neg_remainder = false;
+    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+    uint64_t rem;
 
-    if (sgn_dvdnd) {
-        *plow = ~(*plow);
-        *phigh = ~(*phigh);
-        if (*plow == (int64_t)-1) {
+    if (*phigh < 0) {
+        neg_quotient = !neg_quotient;
+        neg_remainder = !neg_remainder;
+
+        if (unsig_lo == 0) {
+            unsig_hi = -unsig_hi;
+        } else {
+            unsig_hi = ~unsig_hi;
+            unsig_lo = -unsig_lo;
+        }
+    }
+
+    if (divisor < 0) {
+        neg_quotient = !neg_quotient;
+
+        divisor = -divisor;
+    }
+
+    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+    if (neg_quotient) {
+        if (unsig_lo == 0) {
+            *phigh = -unsig_hi;
             *plow = 0;
-            (*phigh)++;
-         } else {
-            (*plow)++;
-         }
+        } else {
+            *phigh = ~unsig_hi;
+            *plow = -unsig_lo;
+        }
+    } else {
+        *phigh = unsig_hi;
+        *plow = unsig_lo;
     }
 
-    if (sgn_divsr) {
-        divisor = 0 - divisor;
-    }
-
-    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
-
-    if (sgn_dvdnd  ^ sgn_divsr) {
-        *plow = 0 - *plow;
+    if (neg_remainder) {
+        return -rem;
+    } else {
+        return rem;
     }
 }
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
 tests/unit/meson.build   |   1 +
 2 files changed, 198 insertions(+)
 create mode 100644 tests/unit/test-div128.c

diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Test 128-bit division functions
+ *
+ * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+typedef struct {
+    uint64_t high;
+    uint64_t low;
+    uint64_t rhigh;
+    uint64_t rlow;
+    uint64_t divisor;
+    uint64_t remainder;
+} test_data_unsigned;
+
+typedef struct {
+    int64_t high;
+    uint64_t low;
+    int64_t rhigh;
+    uint64_t rlow;
+    int64_t divisor;
+    int64_t remainder;
+} test_data_signed;
+
+static const test_data_unsigned test_table_unsigned[] = {
+    /* Dividend fits in 64 bits */
+    { 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000003ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000002ULL, 0x0000000000000001ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0xa000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000002ULL,
+      0x4000000000000000ULL, 0x2000000000000000ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x8000000000000000ULL, 0x0000000000000000ULL},
+
+    /* Dividend > 64 bits, with MSB 0 */
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x000000000000000dULL,
+      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
+      0x0000000000000010ULL, 0x0000000000000001ULL},
+
+    /* Dividend > 64 bits, with MSB 1 */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
+      0x0000000000000010ULL, 0x000000000000000fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
+      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
+
+    /**
+     * Divisor == 64 bits, with MSB 1
+     * and high 64 bits of dividend >= divisor
+     * (for testing normalization)
+     */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0xfddbb9977553310aULL,
+      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
+
+    /* Dividend > 64 bits, divisor almost as big */
+    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
+      0x0000000000000000ULL, 0x000000000000000fULL,
+      0x123456789abcdefeULL, 0x123456789abcde1fULL},
+};
+
+static const test_data_signed test_table_signed[] = {
+    /* Positive dividend, positive/negative divisors */
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0x0000000000000008LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0xfffffffffffffff8LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0x0000000000000237LL, 0x0000000000000183LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0xfffffffffffffdc9LL, 0x0000000000000183LL},
+
+    /* Negative dividend, positive/negative divisors */
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0x0000000000000008LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0x0000000000000237LL, 0xfffffffffffffe7dLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
+};
+
+static void test_divu128(void)
+{
+    int i;
+    uint64_t rem;
+    test_data_unsigned tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
+        tmp = test_table_unsigned[i];
+
+        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpuint(rem, ==, tmp.remainder);
+    }
+}
+
+static void test_divs128(void)
+{
+    int i;
+    int64_t rem;
+    test_data_signed tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
+        tmp = test_table_signed[i];
+
+        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpuint(rem, ==, tmp.remainder);
+    }
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/host-utils/test_divu128", test_divu128);
+    g_test_add_func("/host-utils/test_divs128", test_divs128);
+    return g_test_run();
+}
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
   # all code tested by test-x86-cpuid is inside topology.h
   'test-x86-cpuid': [],
   'test-cutils': [],
+  'test-div128': [],
   'test-shift128': [],
   'test-mul64': [],
   # all code tested by test-int128 is inside int128.h
-- 
2.25.1

Prepare for tracking different masks by renaming this one.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
 1 file changed, 72 insertions(+), 70 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *prev_copy;
     TCGTemp *next_copy;
     uint64_t val;
-    uint64_t mask;
+    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->next_copy = ts;
     ti->prev_copy = ts;
     ti->is_const = false;
-    ti->mask = -1;
+    ti->z_mask = -1;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     if (ts->kind == TEMP_CONST) {
         ti->is_const = true;
         ti->val = ts->val;
-        ti->mask = ts->val;
+        ti->z_mask = ts->val;
         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
             /* High bits of a 32-bit quantity are garbage.  */
-            ti->mask |= ~0xffffffffull;
+            ti->z_mask |= ~0xffffffffull;
         }
     } else {
         ti->is_const = false;
-        ti->mask = -1;
+        ti->z_mask = -1;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t mask;
+    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    mask = si->mask;
+    z_mask = si->z_mask;
     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
         /* High bits of the destination are now garbage.  */
-        mask |= ~0xffffffffull;
+        z_mask |= ~0xffffffffull;
     }
-    di->mask = mask;
+    di->z_mask = z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t mask, partmask, affected, tmp;
+        uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
-        mask = -1;
+        z_mask = -1;
         affected = -1;
         switch (opc) {
         CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext8u):
-            mask = 0xff;
+            z_mask = 0xff;
             goto and_const;
         CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             goto and_const;
         case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_ext32u_i64:
-            mask = 0xffffffffU;
+            z_mask = 0xffffffffU;
             goto and_const;
 
         CASE_OP_32_64(and):
-            mask = arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[2])->z_mask;
             if (arg_is_const(op->args[2])) {
         and_const:
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
-            mask = arg_info(op->args[1])->mask & mask;
+            z_mask = arg_info(op->args[1])->z_mask & z_mask;
             break;
 
         case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_extu_i32_i64:
             /* We do not compute affected as it is a size changing op.  */
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
 
         CASE_OP_32_64(andc):
             /* Known-zeros does not imply known-ones.  Therefore unless
                op->args[2] is constant, we can't infer anything from it.  */
             if (arg_is_const(op->args[2])) {
-                mask = ~arg_info(op->args[2])->mask;
+                z_mask = ~arg_info(op->args[2])->z_mask;
                 goto and_const;
             }
             /* But we certainly know nothing outside args[1] may be set. */
-            mask = arg_info(op->args[1])->mask;
+            z_mask = arg_info(op->args[1])->z_mask;
             break;
 
         case INDEX_op_sar_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_sar_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_shr_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_shr_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_extrl_i64_i32:
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
         case INDEX_op_extrh_i64_i32:
-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
+            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
             break;
 
         CASE_OP_32_64(shl):
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                mask = arg_info(op->args[1])->mask << tmp;
+                z_mask = arg_info(op->args[1])->z_mask << tmp;
             }
             break;
 
         CASE_OP_32_64(neg):
             /* Set to 1 all bits to the left of the rightmost.  */
-            mask = -(arg_info(op->args[1])->mask
-                     & -arg_info(op->args[1])->mask);
+            z_mask = -(arg_info(op->args[1])->z_mask
+                       & -arg_info(op->args[1])->z_mask);
             break;
 
         CASE_OP_32_64(deposit):
-            mask = deposit64(arg_info(op->args[1])->mask,
-                             op->args[3], op->args[4],
-                             arg_info(op->args[2])->mask);
+            z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                               op->args[3], op->args[4],
+                               arg_info(op->args[2])->z_mask);
             break;
 
         CASE_OP_32_64(extract):
-            mask = extract64(arg_info(op->args[1])->mask,
-                             op->args[2], op->args[3]);
+            z_mask = extract64(arg_info(op->args[1])->z_mask,
+                               op->args[2], op->args[3]);
             if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
         CASE_OP_32_64(sextract):
-            mask = sextract64(arg_info(op->args[1])->mask,
-                              op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+            z_mask = sextract64(arg_info(op->args[1])->z_mask,
+                                op->args[2], op->args[3]);
+            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
 
         CASE_OP_32_64(or):
         CASE_OP_32_64(xor):
-            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[1])->z_mask
+                   | arg_info(op->args[2])->z_mask;
             break;
 
         case INDEX_op_clz_i32:
         case INDEX_op_ctz_i32:
-            mask = arg_info(op->args[2])->mask | 31;
+            z_mask = arg_info(op->args[2])->z_mask | 31;
             break;
 
         case INDEX_op_clz_i64:
         case INDEX_op_ctz_i64:
-            mask = arg_info(op->args[2])->mask | 63;
+            z_mask = arg_info(op->args[2])->z_mask | 63;
             break;
 
         case INDEX_op_ctpop_i32:
-            mask = 32 | 31;
+            z_mask = 32 | 31;
             break;
         case INDEX_op_ctpop_i64:
-            mask = 64 | 63;
+            z_mask = 64 | 63;
             break;
 
         CASE_OP_32_64(setcond):
         case INDEX_op_setcond2_i32:
-            mask = 1;
+            z_mask = 1;
             break;
 
         CASE_OP_32_64(movcond):
-            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
+            z_mask = arg_info(op->args[3])->z_mask
+                   | arg_info(op->args[4])->z_mask;
             break;
 
         CASE_OP_32_64(ld8u):
-            mask = 0xff;
+            z_mask = 0xff;
             break;
         CASE_OP_32_64(ld16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             break;
         case INDEX_op_ld32u_i64:
-            mask = 0xffffffffu;
+            z_mask = 0xffffffffu;
             break;
 
         CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
-                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                 }
             }
             break;
 
         CASE_OP_32_64(bswap16):
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffff) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffff) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap16(mask);
+            z_mask = bswap16(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int16_t)mask;
+                z_mask = (int16_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(16, 48);
+                z_mask |= MAKE_64BIT_MASK(16, 48);
                 break;
             }
             break;
 
         case INDEX_op_bswap32_i64:
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffffffffu) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffffffffu) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap32(mask);
+            z_mask = bswap32(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int32_t)mask;
+                z_mask = (int32_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(32, 32);
+                z_mask |= MAKE_64BIT_MASK(32, 32);
                 break;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         /* 32-bit ops generate 32-bit results.  For the result is zero test
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
-        partmask = mask;
+        partmask = z_mask;
         if (!(def->flags & TCG_OPF_64BIT)) {
-            mask |= ~(tcg_target_ulong)0xffffffffu;
+            z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                    vs the high word of the input.  */
             do_setcond_high:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             /* Default case: we know nothing about operation (or were unable
                to compute the operation result) so no propagation is done.
                We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "mask" is
+               block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
                 memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Save the corresponding known-zero bits mask for the
                        first output argument (only one supported so far). */
                     if (i == 0) {
-                        arg_info(op->args[i])->mask = mask;
+                        arg_info(op->args[i])->z_mask = z_mask;
                     }
                 }
             }
-- 
2.25.1

Provide what will become a larger context for splitting
the very large tcg_optimize function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 37 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
+typedef struct OptContext {
+    TCGTempSet temps_used;
+} OptContext;
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+static void init_ts_info(OptContext *ctx, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
     TempOptInfo *ti;
 
-    if (test_bit(idx, temps_used->l)) {
+    if (test_bit(idx, ctx->temps_used.l)) {
         return;
     }
-    set_bit(idx, temps_used->l);
+    set_bit(idx, ctx->temps_used.l);
 
     ti = ts->state_ptr;
     if (ti == NULL) {
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
+static void init_arg_info(OptContext *ctx, TCGArg arg)
 {
-    init_ts_info(temps_used, arg_temp(arg));
+    init_ts_info(ctx, arg_temp(arg));
 }
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
                              TCGOp *op, TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
 
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
-    init_ts_info(temps_used, tv);
+    init_ts_info(ctx, tv);
     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    TCGTempSet temps_used;
+    OptContext ctx = {};
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
 
-    memset(&temps_used, 0, sizeof(temps_used));
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
                 TCGTemp *ts = arg_temp(op->args[i]);
                 if (ts) {
-                    init_ts_info(&temps_used, ts);
+                    init_ts_info(&ctx, ts);
                 }
             }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&temps_used, op->args[i]);
+                init_arg_info(&ctx, op->args[i]);
             }
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                            op->args[1], op->args[2]);
             if (tmp != 2) {
                 if (tmp) {
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[3];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
                 if (tmp) {
             do_brcond_true:
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[5];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     goto do_default;
                 }
             do_brcond_low:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
             } else if ((op->args[5] == TCG_COND_LT
                         || op->args[5] == TCG_COND_GE)
                        && arg_is_const(op->args[3])
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, temps_used.l)) {
+                    if (test_bit(i, ctx.temps_used.l)) {
                         reset_ts(&s->temps[i]);
                     }
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
             } else {
         do_reset_output:
                 for (i = 0; i < nb_oargs; i++) {
-- 
2.25.1

Break the final cleanup clause out of the main switch
statement.  When fully folding an opcode to mov/movi,
use "continue" to process the next opcode, else break
to fall into the final cleanup.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
 1 file changed, 94 insertions(+), 96 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-            break;
+            continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
-                break;
+                continue;
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
                 nb_iargs = 1;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(not):
         CASE_OP_32_64(neg):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(deposit):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract):
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(sextract):
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract2):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                     ((uint32_t)v2 << (32 - shr)));
                 }
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(setcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(brcond):
             tmp = do_constant_folding_cond(opc, op->args[0],
                                            op->args[1], op->args[2]);
-            if (tmp != 2) {
-                if (tmp) {
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[3];
-                } else {
-                    tcg_op_remove(s, op);
-                }
+            switch (tmp) {
+            case 0:
+                tcg_op_remove(s, op);
+                continue;
+            case 1:
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[3];
                 break;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(movcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
-                break;
+                continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
                 uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (fv == 1 && tv == 0) {
                     cond = tcg_invert_cond(cond);
                 } else if (!(tv == 1 && fv == 0)) {
-                    goto do_default;
+                    break;
                 }
                 op->args[3] = cond;
                 op->opc = opc = (opc == INDEX_op_movcond_i32
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                  : INDEX_op_setcond_i64);
                 nb_iargs = 2;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_add2_i32:
         case INDEX_op_sub2_i32:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_mulu2_i32:
             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_brcond2_i32:
             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
                                             op->args[4]);
-            if (tmp != 2) {
-                if (tmp) {
-            do_brcond_true:
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[5];
-                } else {
+            if (tmp == 0) {
             do_brcond_false:
-                    tcg_op_remove(s, op);
-                }
-            } else if ((op->args[4] == TCG_COND_LT
-                        || op->args[4] == TCG_COND_GE)
-                       && arg_is_const(op->args[2])
-                       && arg_info(op->args[2])->val == 0
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0) {
+                tcg_op_remove(s, op);
+                continue;
+            }
+            if (tmp == 1) {
+            do_brcond_true:
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[5];
+                break;
+            }
+            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+                 && arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
+                op->opc = opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_brcond_false;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_brcond_low:
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_brcond_true;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
             do_setcond_const:
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-            } else if ((op->args[5] == TCG_COND_LT
-                        || op->args[5] == TCG_COND_GE)
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0
-                       && arg_is_const(op->args[4])
-                       && arg_info(op->args[4])->val == 0) {
+                continue;
+            }
+            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0
+                 && arg_is_const(op->args[4])
+                 && arg_info(op->args[4])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_setcond_high:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_setcond_high;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_setcond_const;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
-        case INDEX_op_call:
-            if (!(tcg_call_flags(op)
+        default:
+            break;
+        }
+
+        /* Some of the folding above can change opc. */
+        opc = op->opc;
+        def = &tcg_op_defs[opc];
+        if (def->flags & TCG_OPF_BB_END) {
+            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+        } else {
+            if (opc == INDEX_op_call &&
+                !(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
                     if (test_bit(i, ctx.temps_used.l)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     }
                 }
             }
-            goto do_reset_output;
 
-        default:
-        do_default:
-            /* Default case: we know nothing about operation (or were unable
-               to compute the operation result) so no propagation is done.
-               We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "z_mask" is
-               the non-zero bits mask for the first output arg.  */
-            if (def->flags & TCG_OPF_BB_END) {
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-            } else {
-        do_reset_output:
-                for (i = 0; i < nb_oargs; i++) {
-                    reset_temp(op->args[i]);
-                    /* Save the corresponding known-zero bits mask for the
-                       first output argument (only one supported so far). */
-                    if (i == 0) {
-                        arg_info(op->args[i])->z_mask = z_mask;
-                    }
+            for (i = 0; i < nb_oargs; i++) {
+                reset_temp(op->args[i]);
+                /* Save the corresponding known-zero bits mask for the
+                   first output argument (only one supported so far). */
+                if (i == 0) {
+                    arg_info(op->args[i])->z_mask = z_mask;
                 }
             }
-            break;
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-- 
2.25.1

Adjust the interface to take the OptContext parameter instead
of TCGContext or both.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 } TempOptInfo;
 
 typedef struct OptContext {
+    TCGContext *tcg;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
-        tcg_op_remove(s, op);
+        tcg_op_remove(ctx->tcg, op);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
-                             TCGOp *op, TCGArg dst, uint64_t val)
+static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+                             TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    OptContext ctx = {};
+    OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(or):
         CASE_OP_32_64_VEC(and):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 } else {
-                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                 }
                 continue;
             }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-- 
2.25.1

This will expose the variable to subroutines that
will be broken out of tcg_optimize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 
 typedef struct OptContext {
     TCGContext *tcg;
+    TCGOp *prev_mb;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
-    TCGOp *op, *op_next, *prev_mb = NULL;
+    TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-        if (prev_mb) {
+        if (ctx.prev_mb) {
             switch (opc) {
             case INDEX_op_mb:
                 /* Merge two barriers of the same type into one,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  * barrier.  This is stricter than specified but for
                  * the purposes of TCG is better than not optimizing.
                  */
-                prev_mb->args[0] |= op->args[0];
+                ctx.prev_mb->args[0] |= op->args[0];
                 tcg_op_remove(s, op);
                 break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i64:
             case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
-                prev_mb = NULL;
+                ctx.prev_mb = NULL;
                 break;
             }
         } else if (opc == INDEX_op_mb) {
-            prev_mb = op;
+            ctx.prev_mb = op;
         }
     }
 }
-- 
2.25.1

There was no real reason for calls to have separate code here.
Unify init for calls vs non-calls using the call path, which
handles TCG_CALL_DUMMY_ARG.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(OptContext *ctx, TCGArg arg)
-{
-    init_ts_info(ctx, arg_temp(arg));
-}
-
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 {
     TCGTemp *i, *g, *l;
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
     return false;
 }
 
+static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
+{
+    for (int i = 0; i < nb_args; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts) {
+            init_ts_info(ctx, ts);
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (opc == INDEX_op_call) {
             nb_oargs = TCGOP_CALLO(op);
             nb_iargs = TCGOP_CALLI(op);
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                TCGTemp *ts = arg_temp(op->args[i]);
-                if (ts) {
-                    init_ts_info(&ctx, ts);
-                }
-            }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&ctx, op->args[i]);
-            }
         }
+        init_arguments(&ctx, op, nb_oargs + nb_iargs);
 
         /* Do copy propagation */
         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-- 
2.25.1

Continue splitting tcg_optimize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
     }
 }
 
+static void copy_propagate(OptContext *ctx, TCGOp *op,
+                           int nb_oargs, int nb_iargs)
+{
+    TCGContext *s = ctx->tcg;
+
+    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts && ts_is_copy(ts)) {
+            op->args[i] = temp_arg(find_better_copy(s, ts));
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             nb_iargs = def->nb_iargs;
         }
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
-
-        /* Do copy propagation */
-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-            TCGTemp *ts = arg_temp(op->args[i]);
-            if (ts && ts_is_copy(ts)) {
-                op->args[i] = temp_arg(find_better_copy(s, ts));
-            }
-        }
+        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
-- 
2.25.1

Calls are special in that they have a variable number
of arguments, and need to be able to clobber globals.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static bool fold_call(OptContext *ctx, TCGOp *op)
+{
+    TCGContext *s = ctx->tcg;
+    int nb_oargs = TCGOP_CALLO(op);
+    int nb_iargs = TCGOP_CALLI(op);
+    int flags, i;
+
+    init_arguments(ctx, op, nb_oargs + nb_iargs);
+    copy_propagate(ctx, op, nb_oargs, nb_iargs);
+
+    /* If the function reads or writes globals, reset temp data. */
+    flags = tcg_call_flags(op);
+    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+        int nb_globals = s->nb_globals;
+
+        for (i = 0; i < nb_globals; i++) {
+            if (test_bit(i, ctx->temps_used.l)) {
+                reset_ts(&ctx->tcg->temps[i]);
+            }
+        }
+    }
+
+    /* Reset temp data for outputs. */
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+    }
+
+    /* Stop optimizing MB across calls. */
+    ctx->prev_mb = NULL;
+    return true;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
-    int nb_temps, nb_globals, i;
+    int nb_temps, i;
     TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
        available through the doubly linked circular list. */
 
     nb_temps = s->nb_temps;
-    nb_globals = s->nb_globals;
-
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
-        const TCGOpDef *def = &tcg_op_defs[opc];
+        const TCGOpDef *def;
 
-        /* Count the arguments, and initialize the temps that are
-           going to be used */
+        /* Calls are special. */
         if (opc == INDEX_op_call) {
-            nb_oargs = TCGOP_CALLO(op);
-            nb_iargs = TCGOP_CALLI(op);
-        } else {
-            nb_oargs = def->nb_oargs;
-            nb_iargs = def->nb_iargs;
+            fold_call(&ctx, op);
+            continue;
         }
+
+        def = &tcg_op_defs[opc];
+        nb_oargs = def->nb_oargs;
+        nb_iargs = def->nb_iargs;
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
-            if (opc == INDEX_op_call &&
-                !(tcg_call_flags(op)
-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
-                for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, ctx.temps_used.l)) {
-                        reset_ts(&s->temps[i]);
-                    }
-                }
-            }
-
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i32:
             case INDEX_op_qemu_st8_i32:
             case INDEX_op_qemu_st_i64:
-            case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
                 ctx.prev_mb = NULL;
                 break;
-- 
2.25.1

Rather than try to keep these up-to-date across folding,
re-read nb_oargs at the end, after re-reading the opcode.

A couple of asserts need dropping, but that will take care
of itself as we split the function further.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         uint64_t z_mask, partmask, affected, tmp;
-        int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         def = &tcg_op_defs[opc];
-        nb_oargs = def->nb_oargs;
-        nb_iargs = def->nb_iargs;
-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
+        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         CASE_OP_32_64(qemu_ld):
             {
-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
+                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         if (partmask == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
-                nb_iargs = 1;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = opc = (opc == INDEX_op_movcond_i32
                                  ? INDEX_op_setcond_i32
                                  : INDEX_op_setcond_i64);
-                nb_iargs = 2;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
+            int nb_oargs = def->nb_oargs;
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
-- 
2.25.1

Return -1 instead of 2 for failure, so that we can
use comparisons against 0 for all cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
 1 file changed, 74 insertions(+), 71 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
     }
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
-                                       TCGArg y, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+                                    TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
         case TCG_COND_GEU:
             return 1;
         default:
-            return 2;
+            return -1;
         }
     }
-    return 2;
+    return -1;
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
 {
     TCGArg al = p1[0], ah = p1[1];
     TCGArg bl = p2[0], bh = p2[1];
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
         return do_constant_folding_cond_eq(c);
     }
-    return 2;
+    return -1;
 }
 
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(setcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[3]);
-            if (tmp != 2) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[3]);
+            if (i >= 0) {
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             break;
 
         CASE_OP_32_64(brcond):
-            tmp = do_constant_folding_cond(opc, op->args[0],
-                                           op->args[1], op->args[2]);
-            switch (tmp) {
-            case 0:
+            i = do_constant_folding_cond(opc, op->args[0],
+                                         op->args[1], op->args[2]);
+            if (i == 0) {
                 tcg_op_remove(s, op);
                 continue;
-            case 1:
+            } else if (i > 0) {
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(movcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[5]);
-            if (tmp != 2) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[5]);
+            if (i >= 0) {
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         case INDEX_op_brcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                            op->args[4]);
-            if (tmp == 0) {
+            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                          op->args[4]);
+            if (i == 0) {
             do_brcond_false:
                 tcg_op_remove(s, op);
                 continue;
             }
-            if (tmp == 1) {
+            if (i > 0) {
             do_brcond_true:
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_brcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
             }
             break;
 
         case INDEX_op_setcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                            op->args[5]);
-            if (tmp != 2) {
+            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+                                          op->args[5]);
+            if (i >= 0) {
             do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_const;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
             }
-- 
2.25.1

This will allow callers to tail call to these functions
and return true indicating processing complete.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 
     if (ts_are_copies(dst_ts, src_ts)) {
         tcg_op_remove(ctx->tcg, op);
-        return;
+        return true;
     }
 
     reset_ts(dst_ts);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
         di->is_const = si->is_const;
         di->val = si->val;
     }
+    return true;
 }
 
-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
+    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
-- 
2.25.1

Copy z_mask into OptContext, for writeback to the
first output within the new function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGContext *tcg;
     TCGOp *prev_mb;
     TCGTempSet temps_used;
+
+    /* In flight values from optimization. */
+    uint64_t z_mask;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static void finish_folding(OptContext *ctx, TCGOp *op)
+{
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    int i, nb_oargs;
+
+    /*
+     * For an opcode that ends a BB, reset all temp data.
+     * We do no cross-BB optimization.
+     */
+    if (def->flags & TCG_OPF_BB_END) {
+        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+        ctx->prev_mb = NULL;
+        return;
+    }
+
+    nb_oargs = def->nb_oargs;
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+        /*
+         * Save the corresponding known-zero bits mask for the
+         * first output argument (only one supported so far).
+         */
+        if (i == 0) {
+            arg_info(op->args[i])->z_mask = ctx->z_mask;
+        }
+    }
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
+        ctx.z_mask = z_mask;
 
         if (partmask == 0) {
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Some of the folding above can change opc. */
-        opc = op->opc;
-        def = &tcg_op_defs[opc];
-        if (def->flags & TCG_OPF_BB_END) {
-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-        } else {
-            int nb_oargs = def->nb_oargs;
-            for (i = 0; i < nb_oargs; i++) {
-                reset_temp(op->args[i]);
-                /* Save the corresponding known-zero bits mask for the
-                   first output argument (only one supported so far). */
-                if (i == 0) {
-                    arg_info(op->args[i])->z_mask = z_mask;
-                }
-            }
-        }
+        finish_folding(&ctx, op);
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
+        bool done = false;
 
         /* Calls are special. */
         if (opc == INDEX_op_call) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
+            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+            break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        finish_folding(&ctx, op);
+        if (!done) {
+            finish_folding(&ctx, op);
+        }
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

This puts the separate mb optimization into the same framework
as the others.  While fold_qemu_{ld,st} are currently identical,
that won't last as more code gets moved.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 38 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mb(OptContext *ctx, TCGOp *op)
+{
+    /* Eliminate duplicate and redundant fence instructions.  */
+    if (ctx->prev_mb) {
+        /*
+         * Merge two barriers of the same type into one,
+         * or a weaker barrier into a stronger one,
+         * or two weaker barriers into a stronger one.
+         *   mb X; mb Y => mb X|Y
+         *   mb; strl => mb; st
+         *   ldaq; mb => ld; mb
+         *   ldaq; strl => ld; mb; st
+         * Other combinations are also merged into a strong
+         * barrier.  This is stricter than specified but for
+         * the purposes of TCG is better than not optimizing.
+         */
+        ctx->prev_mb->args[0] |= op->args[0];
+        tcg_op_remove(ctx->tcg, op);
+    } else {
+        ctx->prev_mb = op;
+    }
+    return true;
+}
+
+static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
+static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        case INDEX_op_mb:
+            done = fold_mb(&ctx, op);
+            break;
+        case INDEX_op_qemu_ld_i32:
+        case INDEX_op_qemu_ld_i64:
+            done = fold_qemu_ld(&ctx, op);
+            break;
+        case INDEX_op_qemu_st_i32:
+        case INDEX_op_qemu_st8_i32:
+        case INDEX_op_qemu_st_i64:
+            done = fold_qemu_st(&ctx, op);
+            break;
+
         default:
             break;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (!done) {
             finish_folding(&ctx, op);
         }
-
-        /* Eliminate duplicate and redundant fence instructions.  */
-        if (ctx.prev_mb) {
-            switch (opc) {
-            case INDEX_op_mb:
-                /* Merge two barriers of the same type into one,
-                 * or a weaker barrier into a stronger one,
-                 * or two weaker barriers into a stronger one.
-                 *   mb X; mb Y => mb X|Y
-                 *   mb; strl => mb; st
-                 *   ldaq; mb => ld; mb
-                 *   ldaq; strl => ld; mb; st
-                 * Other combinations are also merged into a strong
-                 * barrier.  This is stricter than specified but for
-                 * the purposes of TCG is better than not optimizing.
-                 */
-                ctx.prev_mb->args[0] |= op->args[0];
-                tcg_op_remove(s, op);
-                break;
-
-            default:
-                /* Opcodes that end the block stop the optimization.  */
-                if ((def->flags & TCG_OPF_BB_END) == 0) {
-                    break;
-                }
-                /* fallthru */
-            case INDEX_op_qemu_ld_i32:
-            case INDEX_op_qemu_ld_i64:
-            case INDEX_op_qemu_st_i32:
-            case INDEX_op_qemu_st8_i32:
-            case INDEX_op_qemu_st_i64:
-                /* Opcodes that touch guest memory stop the optimization.  */
-                ctx.prev_mb = NULL;
-                break;
-            }
-        } else if (opc == INDEX_op_mb) {
-            ctx.prev_mb = op;
-        }
     }
 }
-- 
2.25.1

Split out a whole bunch of placeholder functions, which are
currently identical.  That won't last as more code gets moved.

Use CASE_32_64_VEC for some logical operators that previously
missed the addition of vectors.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 219 insertions(+), 52 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
     }
 }
 
+/*
+ * The fold_* functions return true when processing is complete,
+ * usually by folding the operation to a constant or to a copy,
+ * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
+ * like collect information about the value produced, for use in
+ * optimizing a subsequent operation.
+ *
+ * These first fold_* functions are all helpers, used by other
+ * folders for more specific operations.
+ */
+
+static bool fold_const1(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = do_constant_folding(op->opc, t, 0);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_const2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = do_constant_folding(op->opc, t1, t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
+/*
+ * These outermost fold_<op> functions are sorted alphabetically.
+ */
+
+static bool fold_add(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_and(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_andc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_divide(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_eqv(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_exts(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_extu(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
 static bool fold_mb(OptContext *ctx, TCGOp *op)
 {
     /* Eliminate duplicate and redundant fence instructions.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mul(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_nand(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_neg(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_nor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_not(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+static bool fold_or(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_orc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_remainder(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_shift(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_sub(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+static bool fold_xor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(not):
-        CASE_OP_32_64(neg):
-        CASE_OP_32_64(ext8s):
-        CASE_OP_32_64(ext8u):
-        CASE_OP_32_64(ext16s):
-        CASE_OP_32_64(ext16u):
-        CASE_OP_32_64(ctpop):
-        case INDEX_op_ext32s_i64:
-        case INDEX_op_ext32u_i64:
-        case INDEX_op_ext_i32_i64:
-        case INDEX_op_extu_i32_i64:
-        case INDEX_op_extrl_i64_i32:
-        case INDEX_op_extrh_i64_i32:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
         case INDEX_op_bswap64_i64:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(add):
-        CASE_OP_32_64(sub):
-        CASE_OP_32_64(mul):
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(and):
-        CASE_OP_32_64(xor):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-        CASE_OP_32_64(andc):
-        CASE_OP_32_64(orc):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-        CASE_OP_32_64(div):
-        CASE_OP_32_64(divu):
-        CASE_OP_32_64(rem):
-        CASE_OP_32_64(remu):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        default:
+            break;
+
+        /* ---------------------------------------------------------- */
+        /* Sorted alphabetically by opcode as much as possible. */
+
+        CASE_OP_32_64_VEC(add):
+            done = fold_add(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(and):
+            done = fold_and(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(andc):
+            done = fold_andc(&ctx, op);
+            break;
+        CASE_OP_32_64(ctpop):
+            done = fold_ctpop(&ctx, op);
+            break;
+        CASE_OP_32_64(div):
+        CASE_OP_32_64(divu):
+            done = fold_divide(&ctx, op);
+            break;
+        CASE_OP_32_64(eqv):
+            done = fold_eqv(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8s):
+        CASE_OP_32_64(ext16s):
+        case INDEX_op_ext32s_i64:
+        case INDEX_op_ext_i32_i64:
+            done = fold_exts(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8u):
+        CASE_OP_32_64(ext16u):
+        case INDEX_op_ext32u_i64:
+        case INDEX_op_extu_i32_i64:
+        case INDEX_op_extrl_i64_i32:
+        case INDEX_op_extrh_i64_i32:
+            done = fold_extu(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(mul):
+            done = fold_mul(&ctx, op);
+            break;
+        CASE_OP_32_64(mulsh):
+        CASE_OP_32_64(muluh):
+            done = fold_mul_highpart(&ctx, op);
+            break;
+        CASE_OP_32_64(nand):
+            done = fold_nand(&ctx, op);
+            break;
+        CASE_OP_32_64(neg):
+            done = fold_neg(&ctx, op);
+            break;
+        CASE_OP_32_64(nor):
+            done = fold_nor(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(not):
+            done = fold_not(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(or):
+            done = fold_or(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(orc):
+            done = fold_orc(&ctx, op);
+            break;
         case INDEX_op_qemu_ld_i32:
         case INDEX_op_qemu_ld_i64:
             done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_qemu_st_i64:
             done = fold_qemu_st(&ctx, op);
             break;
-
-        default:
+        CASE_OP_32_64(rem):
+        CASE_OP_32_64(remu):
+            done = fold_remainder(&ctx, op);
+            break;
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+            done = fold_shift(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(sub):
+            done = fold_sub(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(xor):
+            done = fold_xor(&ctx, op);
             break;
         }
 
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
 1 file changed, 72 insertions(+), 73 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
+    int inv = 0;
+
+    if (i >= 0) {
+        goto do_setcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
+            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
+            goto do_setcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            goto do_setcond_high;
+        }
+
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+                                     op->args[4], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            op->args[2] = op->args[3];
+            op->args[3] = cond;
+            op->opc = INDEX_op_setcond_i32;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    do_setcond_high:
+        op->args[1] = op->args[2];
+        op->args[2] = op->args[4];
+        op->args[3] = cond;
+        op->opc = INDEX_op_setcond_i32;
+        break;
+    }
+    return false;
+
+ do_setcond_const:
+    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_setcond2_i32:
-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                          op->args[5]);
-            if (i >= 0) {
-            do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
-                continue;
-            }
-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0
-                 && arg_is_const(op->args[4])
-                 && arg_info(op->args[4])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_setcond_high:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_const;
-                } else if (i > 0) {
-                    goto do_setcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i < 0) {
-                    break;
-                }
-            do_setcond_low:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[2] = op->args[3];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_low;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(shr):
             done = fold_shift(&ctx, op);
             break;
+        case INDEX_op_setcond2_i32:
+            done = fold_setcond2(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
 1 file changed, 81 insertions(+), 78 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[4];
+    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
+    TCGArg label = op->args[5];
+    int inv = 0;
+
+    if (i >= 0) {
+        goto do_brcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
+            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
+            goto do_brcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+                                     op->args[2], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            goto do_brcond_high;
+        }
+
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            op->opc = INDEX_op_brcond_i32;
+            op->args[1] = op->args[2];
+            op->args[2] = cond;
+            op->args[3] = label;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    do_brcond_high:
+        op->opc = INDEX_op_brcond_i32;
+        op->args[0] = op->args[1];
+        op->args[1] = op->args[3];
+        op->args[2] = cond;
+        op->args[3] = label;
+        break;
+
+    do_brcond_const:
+        if (i == 0) {
+            tcg_op_remove(ctx->tcg, op);
+            return true;
+        }
+        op->opc = INDEX_op_br;
+        op->args[0] = label;
+        break;
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_brcond2_i32:
-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                          op->args[4]);
-            if (i == 0) {
-            do_brcond_false:
-                tcg_op_remove(s, op);
-                continue;
-            }
-            if (i > 0) {
-            do_brcond_true:
-                op->opc = opc = INDEX_op_br;
-                op->args[0] = op->args[5];
-                break;
-            }
-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
-                 && arg_is_const(op->args[2])
-                 && arg_info(op->args[2])->val == 0
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_brcond_high:
-                op->opc = opc = INDEX_op_brcond_i32;
-                op->args[0] = op->args[1];
-                op->args[1] = op->args[3];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i > 0) {
-                    goto do_brcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i < 0) {
-                    break;
-                }
-            do_brcond_low:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_high;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_low;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(andc):
             done = fold_andc(&ctx, op);
             break;
+        case INDEX_op_brcond2_i32:
+            done = fold_brcond2(&ctx, op);
+            break;
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+        uint32_t a = arg_info(op->args[2])->val;
+        uint32_t b = arg_info(op->args[3])->val;
+        uint64_t r = (uint64_t)a * b;
+        TCGArg rl, rh;
+        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+        rl = op->args[0];
+        rh = op->args[1];
+        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
+        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+        return true;
+    }
+    return false;
+}
+
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_mulu2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-                uint32_t a = arg_info(op->args[2])->val;
-                uint32_t b = arg_info(op->args[3])->val;
-                uint64_t r = (uint64_t)a * b;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
+        case INDEX_op_mulu2_i32:
+            done = fold_mulu2_i32(&ctx, op);
+            break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
             break;
-- 
2.25.1

Add two additional helpers, fold_add2_i32 and fold_sub2_i32
which will not be simple wrappers forever.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+{
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
+        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+        uint32_t al = arg_info(op->args[2])->val;
+        uint32_t ah = arg_info(op->args[3])->val;
+        uint32_t bl = arg_info(op->args[4])->val;
+        uint32_t bh = arg_info(op->args[5])->val;
+        uint64_t a = ((uint64_t)ah << 32) | al;
+        uint64_t b = ((uint64_t)bh << 32) | bl;
+        TCGArg rl, rh;
+        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+        if (add) {
+            a += b;
+        } else {
+            a -= b;
+        }
+
+        rl = op->args[0];
+        rh = op->args[1];
+        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
+        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+        return true;
+    }
+    return false;
+}
+
+static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, true);
+}
+
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, false);
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_add2_i32:
-        case INDEX_op_sub2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-                uint32_t al = arg_info(op->args[2])->val;
-                uint32_t ah = arg_info(op->args[3])->val;
-                uint32_t bl = arg_info(op->args[4])->val;
-                uint32_t bh = arg_info(op->args[5])->val;
-                uint64_t a = ((uint64_t)ah << 32) | al;
-                uint64_t b = ((uint64_t)bh << 32) | bl;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                if (opc == INDEX_op_add2_i32) {
-                    a += b;
-                } else {
-                    a -= b;
-                }
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
-                continue;
-            }
-            break;
 
         default:
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
+        case INDEX_op_add2_i32:
+            done = fold_add2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
+        case INDEX_op_sub2_i32:
+            done = fold_sub2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_movcond(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode opc = op->opc;
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+
+    if (i >= 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
+    }
+
+    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+        uint64_t tv = arg_info(op->args[3])->val;
+        uint64_t fv = arg_info(op->args[4])->val;
+
+        opc = (opc == INDEX_op_movcond_i32
+               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+
+        if (tv == 1 && fv == 0) {
+            op->opc = opc;
+            op->args[3] = cond;
+        } else if (fv == 1 && tv == 0) {
+            op->opc = opc;
+            op->args[3] = tcg_invert_cond(cond);
+        }
+    }
+    return false;
+}
+
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(movcond):
-            i = do_constant_folding_cond(opc, op->args[1],
-                                         op->args[2], op->args[5]);
-            if (i >= 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
-                continue;
-            }
-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-                uint64_t tv = arg_info(op->args[3])->val;
-                uint64_t fv = arg_info(op->args[4])->val;
-                TCGCond cond = op->args[5];
-
-                if (fv == 1 && tv == 0) {
-                    cond = tcg_invert_cond(cond);
-                } else if (!(tv == 1 && fv == 0)) {
-                    break;
-                }
-                op->args[3] = cond;
-                op->opc = opc = (opc == INDEX_op_movcond_i32
-                                 ? INDEX_op_setcond_i32
-                                 : INDEX_op_setcond_i64);
-            }
-            break;
-
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(movcond):
+            done = fold_movcond(&ctx, op);
+            break;
         CASE_OP_32_64(mul):
             done = fold_mul(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_extract2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t v1 = arg_info(op->args[1])->val;
+        uint64_t v2 = arg_info(op->args[2])->val;
+        int shr = op->args[3];
+
+        if (op->opc == INDEX_op_extract2_i64) {
+            v1 >>= shr;
+            v2 <<= 64 - shr;
+        } else {
+            v1 = (uint32_t)v1 >> shr;
+            v2 = (int32_t)v2 << (32 - shr);
+        }
+        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
+    }
+    return false;
+}
+
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
     return fold_const1(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract2):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                uint64_t v1 = arg_info(op->args[1])->val;
-                uint64_t v2 = arg_info(op->args[2])->val;
-                int shr = op->args[3];
-
-                if (opc == INDEX_op_extract2_i64) {
-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
-                } else {
-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
-                                    ((uint32_t)v2 << (32 - shr)));
-                }
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract2):
+            done = fold_extract2(&ctx, op);
+            break;
         CASE_OP_32_64(ext8s):
         CASE_OP_32_64(ext16s):
         case INDEX_op_ext32s_i64:
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_extract(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = extract64(t, op->args[2], op->args[3]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 }
 
+static bool fold_sextract(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = sextract64(t, op->args[2], op->args[3]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract):
-            if (arg_is_const(op->args[1])) {
-                tmp = extract64(arg_info(op->args[1])->val,
-                                op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        CASE_OP_32_64(sextract):
-            if (arg_is_const(op->args[1])) {
-                tmp = sextract64(arg_info(op->args[1])->val,
-                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract):
+            done = fold_extract(&ctx, op);
+            break;
         CASE_OP_32_64(extract2):
             done = fold_extract2(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_setcond2_i32:
             done = fold_setcond2(&ctx, op);
             break;
+        CASE_OP_32_64(sextract):
+            done = fold_sextract(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
     return fold_const1(ctx, op);
 }
 
+static bool fold_deposit(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = deposit64(t1, op->args[3], op->args[4], t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
 static bool fold_divide(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(deposit):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = deposit64(arg_info(op->args[1])->val,
-                                op->args[3], op->args[4],
-                                arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
+        CASE_OP_32_64(deposit):
+            done = fold_deposit(&ctx, op);
+            break;
         CASE_OP_32_64(div):
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_bswap(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+
+        t = do_constant_folding(op->opc, t, op->args[2]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(bswap16):
-        CASE_OP_32_64(bswap32):
-        case INDEX_op_bswap64_i64:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          op->args[2]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_brcond2_i32:
             done = fold_brcond2(&ctx, op);
             break;
+        CASE_OP_32_64(bswap16):
+        CASE_OP_32_64(bswap32):
+        case INDEX_op_bswap64_i64:
+            done = fold_bswap(&ctx, op);
+            break;
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             done = fold_count_zeros(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_dup(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+        t = dup_const(TCGOP_VECE(op), t);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_dup2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
+                               arg_info(op->args[2])->val);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+
+    if (args_are_copies(op->args[1], op->args[2])) {
+        op->opc = INDEX_op_dup_vec;
+        TCGOP_VECE(op) = MO_32;
+    }
+    return false;
+}
+
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             break;
 
-        case INDEX_op_dup_vec:
-            if (arg_is_const(op->args[1])) {
-                tmp = arg_info(op->args[1])->val;
-                tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        case INDEX_op_dup2_vec:
-            assert(TCG_TARGET_REG_BITS == 32);
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0],
-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
-                                           arg_info(op->args[2])->val));
-                continue;
-            } else if (args_are_copies(op->args[1], op->args[2])) {
-                op->opc = INDEX_op_dup_vec;
-                TCGOP_VECE(op) = MO_32;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
             break;
+        case INDEX_op_dup_vec:
+            done = fold_dup(&ctx, op);
+            break;
+        case INDEX_op_dup2_vec:
+            done = fold_dup2(&ctx, op);
+            break;
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
-- 
2.25.1

This is the final entry in the main switch that was in a
different form.  After this, we have the option to convert
the switch into a function dispatch table.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mov(OptContext *ctx, TCGOp *op)
+{
+    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+}
+
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Propagate constants through copy operations and do constant
-           folding.  Constants will be substituted to arguments by register
-           allocator where needed and possible.  Also detect copies. */
+        /*
+         * Process each opcode.
+         * Sorted alphabetically by opcode as much as possible.
+         */
         switch (opc) {
-        CASE_OP_32_64_VEC(mov):
-            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            break;
-
-        default:
-            break;
-
-        /* ---------------------------------------------------------- */
-        /* Sorted alphabetically by opcode as much as possible. */
-
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64_VEC(mov):
+            done = fold_mov(&ctx, op);
+            break;
         CASE_OP_32_64(movcond):
             done = fold_movcond(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
+        default:
+            break;
         }
 
         if (!done) {
-- 
2.25.1

Pull the "op r, a, a => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to @i. */
+static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
  */
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
 
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(xor):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, a => mov r, a" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to identity. */
+static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
+ *
+ * The ordering of the transformations should be:
+ *   1) those that produce a constant
+ *   2) those that produce a copy
+ *   3) those that produce information about the result value.
  */
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(and):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, 0 => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to @i. */
+static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             continue;
         }
 
-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            if (arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Compute the type of the operation early.

There are at least 4 places that used a def->flags ladder
to determine the type of the operation being optimized.

There were two places that assumed !TCG_OPF_64BIT means
TCG_TYPE_I32, and so could potentially compute incorrect
results for vector operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 1 file changed, 89 insertions(+), 60 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
 
     /* In flight values from optimization. */
     uint64_t z_mask;
+    TCGType type;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
-    const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
     uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     reset_ts(dst_ts);
     di = ts_info(dst_ts);
     si = ts_info(src_ts);
-    def = &tcg_op_defs[op->opc];
-    if (def->flags & TCG_OPF_VECTOR) {
-        new_op = INDEX_op_mov_vec;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        new_op = INDEX_op_mov_i64;
-    } else {
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
         new_op = INDEX_op_mov_i32;
+        break;
+    case TCG_TYPE_I64:
+        new_op = INDEX_op_mov_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
+        new_op = INDEX_op_mov_vec;
+        break;
+    default:
+        g_assert_not_reached();
     }
     op->opc = new_op;
-    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
     op->args[0] = dst;
     op->args[1] = src;
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    const TCGOpDef *def = &tcg_op_defs[op->opc];
-    TCGType type;
-    TCGTemp *tv;
-
-    if (def->flags & TCG_OPF_VECTOR) {
-        type = TCGOP_VECL(op) + TCG_TYPE_V64;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        type = TCG_TYPE_I64;
-    } else {
-        type = TCG_TYPE_I32;
-    }
-
     /* Convert movi to mov with constant temp. */
-    tv = tcg_constant_internal(type, val);
+    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     }
 }
 
-static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
+static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
+                                    uint64_t x, uint64_t y)
 {
-    const TCGOpDef *def = &tcg_op_defs[op];
     uint64_t res = do_constant_folding_2(op, x, y);
-    if (!(def->flags & TCG_OPF_64BIT)) {
+    if (type == TCG_TYPE_I32) {
         res = (int32_t)res;
     }
     return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
  * Return -1 if the condition can't be simplified,
  * and the result of the condition (0 or 1) if it can.
  */
-static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+static int do_constant_folding_cond(TCGType type, TCGArg x,
                                     TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
 
     if (arg_is_const(x) && arg_is_const(y)) {
-        const TCGOpDef *def = &tcg_op_defs[op];
-        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
-        if (def->flags & TCG_OPF_64BIT) {
-            return do_constant_folding_cond_64(xv, yv, c);
-        } else {
+        switch (type) {
+        case TCG_TYPE_I32:
             return do_constant_folding_cond_32(xv, yv, c);
+        case TCG_TYPE_I64:
+            return do_constant_folding_cond_64(xv, yv, c);
+        default:
+            /* Only scalar comparisons are optimizable */
+            return -1;
         }
     } else if (args_are_copies(x, y)) {
         return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = do_constant_folding(op->opc, t, 0);
+        t = do_constant_folding(op->opc, ctx->type, t, 0);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
         uint64_t t1 = arg_info(op->args[1])->val;
         uint64_t t2 = arg_info(op->args[2])->val;
 
-        t1 = do_constant_folding(op->opc, t1, t2);
+        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                      op->args[2], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
             goto do_brcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
-        t = do_constant_folding(op->opc, t, op->args[2]);
+        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         uint64_t t = arg_info(op->args[1])->val;
 
         if (t != 0) {
-            t = do_constant_folding(op->opc, t, 0);
+            t = do_constant_folding(op->opc, ctx->type, t, 0);
             return tcg_opt_gen_movi(ctx, op, op->args[0], t);
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
-    TCGOpcode opc = op->opc;
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
+        TCGOpcode opc;
 
-        opc = (opc == INDEX_op_movcond_i32
-               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+        switch (ctx->type) {
+        case TCG_TYPE_I32:
+            opc = INDEX_op_setcond_i32;
+            break;
+        case TCG_TYPE_I64:
+            opc = INDEX_op_setcond_i64;
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         if (tv == 1 && fv == 0) {
             op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
             goto do_setcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                      op->args[4], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
         copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
+        /* Pre-compute the type of the operation. */
+        if (def->flags & TCG_OPF_VECTOR) {
+            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
+        } else if (def->flags & TCG_OPF_64BIT) {
+            ctx.type = TCG_TYPE_I64;
+        } else {
+            ctx.type = TCG_TYPE_I32;
+        }
+
         /* For commutative operations make constant second argument */
         switch (opc) {
         CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Proceed with possible constant folding. */
                     break;
                 }
-                if (opc == INDEX_op_sub_i32) {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     neg_op = INDEX_op_neg_i32;
                     have_neg = TCG_TARGET_HAS_neg_i32;
-                } else if (opc == INDEX_op_sub_i64) {
+                    break;
+                case TCG_TYPE_I64:
                     neg_op = INDEX_op_neg_i64;
                     have_neg = TCG_TARGET_HAS_neg_i64;
-                } else if (TCG_TARGET_HAS_neg_vec) {
-                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
-                    unsigned vece = TCGOP_VECE(op);
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
-                } else {
                     break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    neg_op = INDEX_op_neg_vec;
+                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
+                                                   TCGOP_VECE(op)) > 0;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_neg) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGOpcode not_op;
                 bool have_not;
 
-                if (def->flags & TCG_OPF_VECTOR) {
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                } else if (def->flags & TCG_OPF_64BIT) {
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                } else {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     not_op = INDEX_op_not_i32;
                     have_not = TCG_TARGET_HAS_not_i32;
+                    break;
+                case TCG_TYPE_I64:
+                    not_op = INDEX_op_not_i64;
+                    have_not = TCG_TARGET_HAS_not_i64;
+                    break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    not_op = INDEX_op_not_vec;
+                    have_not = TCG_TARGET_HAS_not_vec;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_not) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
         partmask = z_mask;
-        if (!(def->flags & TCG_OPF_64BIT)) {
+        if (ctx.type == TCG_TYPE_I32) {
             z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
-- 
2.25.1

Split out the conditional conversion from a more complex logical
operation to a simple NOT.  Create a couple more helpers to make
this easy for the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/*
+ * Convert @op to NOT, if NOT is supported by the host.
+ * Return true f the conversion is successful, which will still
+ * indicate that the processing is complete.
+ */
+static bool fold_not(OptContext *ctx, TCGOp *op);
+static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
+{
+    TCGOpcode not_op;
+    bool have_not;
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        not_op = INDEX_op_not_i32;
+        have_not = TCG_TARGET_HAS_not_i32;
+        break;
+    case TCG_TYPE_I64:
+        not_op = INDEX_op_not_i64;
+        have_not = TCG_TARGET_HAS_not_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        not_op = INDEX_op_not_vec;
+        have_not = TCG_TARGET_HAS_not_vec;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_not) {
+        op->opc = not_op;
+        op->args[1] = op->args[idx];
+        return fold_not(ctx, op);
+    }
+    return false;
+}
+
+/* If the binary operation has first argument @i, fold to NOT. */
+static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return fold_to_not(ctx, op, 2);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to @i. */
 static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to NOT. */
+static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return fold_to_not(ctx, op, 1);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, -1)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_not(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    /* Because of fold_to_not, we want to always return true, via finish. */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_ix_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             }
             break;
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(nand):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64(nor):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(andc):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == -1) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        try_not:
-            {
-                TCGOpcode not_op;
-                bool have_not;
-
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    not_op = INDEX_op_not_i32;
-                    have_not = TCG_TARGET_HAS_not_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_not) {
-                    break;
-                }
-                op->opc = not_op;
-                reset_temp(op->args[0]);
-                op->args[1] = op->args[i];
-                continue;
-            }
         default:
             break;
         }
-- 
2.25.1

Even though there is only one user, place this more complex
conversion into its own helper.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+    /*
+     * Because of fold_sub_to_neg, we want to always return true,
+     * via finish_folding.
+     */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode neg_op;
+    bool have_neg;
+
+    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
+        return false;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        neg_op = INDEX_op_neg_i32;
+        have_neg = TCG_TARGET_HAS_neg_i32;
+        break;
+    case TCG_TYPE_I64:
+        neg_op = INDEX_op_neg_i64;
+        have_neg = TCG_TARGET_HAS_neg_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        neg_op = INDEX_op_neg_vec;
+        have_neg = (TCG_TARGET_HAS_neg_vec &&
+                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_neg) {
+        op->opc = neg_op;
+        op->args[1] = op->args[2];
+        return fold_neg(ctx, op);
+    }
+    return false;
+}
+
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_sub_to_neg(ctx, op)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 continue;
             }
             break;
-        CASE_OP_32_64_VEC(sub):
-            {
-                TCGOpcode neg_op;
-                bool have_neg;
-
-                if (arg_is_const(op->args[2])) {
-                    /* Proceed with possible constant folding. */
-                    break;
-                }
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    neg_op = INDEX_op_neg_i32;
-                    have_neg = TCG_TARGET_HAS_neg_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    neg_op = INDEX_op_neg_i64;
-                    have_neg = TCG_TARGET_HAS_neg_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
-                                                   TCGOP_VECE(op)) > 0;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_neg) {
-                    break;
-                }
-                if (arg_is_const(op->args[1])
-                    && arg_info(op->args[1])->val == 0) {
-                    op->opc = neg_op;
-                    reset_temp(op->args[0]);
-                    op->args[1] = op->args[2];
-                    continue;
-                }
-            }
-            break;
         default:
             break;
         }
-- 
2.25.1

Pull the "op r, a, i => mov r, a" optimization into a function,
and use them in the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to identity. */
+static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to NOT. */
 static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_sub_to_neg(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, const => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Pull the "op r, 0, b => movi r, 0" optimization into a function,
and use it in fold_shift.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
     return false;
 }
 
+/* If the binary operation has first argument @i, fold to @i. */
+static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has first argument @i, fold to NOT. */
 static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
-           and "sub r, 0, a => neg r, a" case.  */
-        switch (opc) {
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Move all of the known-zero optimizations into the per-opcode
functions.  Use fold_masks when there is a possibility of the
result being determined, and simply set ctx->z_mask otherwise.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
 1 file changed, 294 insertions(+), 251 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGTempSet temps_used;
 
     /* In flight values from optimization. */
-    uint64_t z_mask;
+    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
+    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
     TCGType type;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_masks(OptContext *ctx, TCGOp *op)
+{
+    uint64_t a_mask = ctx->a_mask;
+    uint64_t z_mask = ctx->z_mask;
+
+    /*
+     * 32-bit ops generate 32-bit results.  For the result is zero test
+     * below, we can ignore high bits, but for further optimizations we
+     * need to record that the high bits contain garbage.
+     */
+    if (ctx->type == TCG_TYPE_I32) {
+        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
+        a_mask &= MAKE_64BIT_MASK(0, 32);
+        z_mask &= MAKE_64BIT_MASK(0, 32);
+    }
+
+    if (z_mask == 0) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
+    }
+    if (a_mask == 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * Convert @op to NOT, if NOT is supported by the host.
  * Return true f the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1, z2;
+
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+    z2 = arg_info(op->args[2])->z_mask;
+    ctx->z_mask = z1 & z2;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer affected bits from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        ctx->a_mask = z1 & ~z2;
+    }
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1;
+
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer anything from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
+        ctx->a_mask = z1 & ~z2;
+        z1 &= z2;
+    }
+    ctx->z_mask = z1;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, sign;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
         t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask = arg_info(op->args[1])->z_mask;
+    switch (op->opc) {
+    case INDEX_op_bswap16_i32:
+    case INDEX_op_bswap16_i64:
+        z_mask = bswap16(z_mask);
+        sign = INT16_MIN;
+        break;
+    case INDEX_op_bswap32_i32:
+    case INDEX_op_bswap32_i64:
+        z_mask = bswap32(z_mask);
+        sign = INT32_MIN;
+        break;
+    case INDEX_op_bswap64_i64:
+        z_mask = bswap64(z_mask);
+        sign = INT64_MIN;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+    case TCG_BSWAP_OZ:
+        break;
+    case TCG_BSWAP_OS:
+        /* If the sign bit may be 1, force all the bits above to 1. */
+        if (z_mask & sign) {
+            z_mask |= sign;
+        }
+        break;
+    default:
+        /* The high bits are undefined: force all bits above the sign to 1. */
+        z_mask |= sign << 1;
+        break;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
 
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
     }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        z_mask = 31;
+        break;
+    case TCG_TYPE_I64:
+        z_mask = 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
+
     return false;
 }
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        ctx->z_mask = 32 | 31;
+        break;
+    case TCG_TYPE_I64:
+        ctx->z_mask = 64 | 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
 }
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
         t1 = deposit64(t1, op->args[3], op->args[4], t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
+
+    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                            op->args[3], op->args[4],
+                            arg_info(op->args[2])->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
         t = extract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask, sign;
+    bool type_change = false;
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8s):
+        sign = INT8_MIN;
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16s):
+        sign = INT16_MIN;
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_ext_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32s_i64:
+        sign = INT32_MIN;
+        z_mask = (uint32_t)z_mask;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if (z_mask & sign) {
+        z_mask |= sign;
+    } else if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extu(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask;
+    bool type_change = false;
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8u):
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16u):
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_extrl_i64_i32:
+    case INDEX_op_extu_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32u_i64:
+        z_mask = (uint32_t)z_mask;
+        break;
+    case INDEX_op_extrh_i64_i32:
+        type_change = true;
+        z_mask >>= 32;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    ctx->z_mask = z_mask;
+    if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    return fold_masks(ctx, op);
 }
 
 static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
 
+    ctx->z_mask = arg_info(op->args[3])->z_mask
+                | arg_info(op->args[4])->z_mask;
+
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (fold_const1(ctx, op)) {
         return true;
     }
+
+    /* Set to 1 all bits to the left of the rightmost.  */
+    z_mask = arg_info(op->args[1])->z_mask;
+    ctx->z_mask = -(z_mask & -z_mask);
+
     /*
      * Because of fold_sub_to_neg, we want to always return true,
      * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
+    MemOp mop = get_memop(oi);
+    int width = 8 * memop_size(mop);
+
+    if (!(mop & MO_SIGN) && width < 64) {
+        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    }
+
     /* Opcodes that touch guest memory stop the mb optimization.  */
     ctx->prev_mb = NULL;
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
+
+    ctx->z_mask = 1;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
         op->opc = INDEX_op_setcond_i32;
         break;
     }
+
+    ctx->z_mask = 1;
     return false;
 
  do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
+    int64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
         t = sextract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0 && z_mask >= 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
+
+    if (arg_is_const(op->args[2])) {
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
+                                          arg_info(op->args[1])->z_mask,
+                                          arg_info(op->args[2])->val);
+        return fold_masks(ctx, op);
+    }
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
     return fold_addsub2_i32(ctx, op, false);
 }
 
+static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
+{
+    /* We can't do any folding with a load, but we can record bits. */
+    switch (op->opc) {
+    CASE_OP_32_64(ld8u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        break;
+    CASE_OP_32_64(ld16u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        break;
+    case INDEX_op_ld32u_i64:
+        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
         bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify using known-zero bits. Currently only ops with a single
-           output argument is supported. */
-        z_mask = -1;
-        affected = -1;
-        switch (opc) {
-        CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext8u):
-            z_mask = 0xff;
-            goto and_const;
-        CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext16u):
-            z_mask = 0xffff;
-            goto and_const;
-        case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_ext32u_i64:
-            z_mask = 0xffffffffU;
-            goto and_const;
-
-        CASE_OP_32_64(and):
-            z_mask = arg_info(op->args[2])->z_mask;
-            if (arg_is_const(op->args[2])) {
-        and_const:
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            z_mask = arg_info(op->args[1])->z_mask & z_mask;
-            break;
-
-        case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_extu_i32_i64:
-            /* We do not compute affected as it is a size changing op.  */
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-
-        CASE_OP_32_64(andc):
-            /* Known-zeros does not imply known-ones.  Therefore unless
-               op->args[2] is constant, we can't infer anything from it.  */
-            if (arg_is_const(op->args[2])) {
-                z_mask = ~arg_info(op->args[2])->z_mask;
-                goto and_const;
-            }
-            /* But we certainly know nothing outside args[1] may be set. */
-            z_mask = arg_info(op->args[1])->z_mask;
-            break;
-
-        case INDEX_op_sar_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_sar_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_shr_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_shr_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_extrl_i64_i32:
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-        case INDEX_op_extrh_i64_i32:
-            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
-            break;
-
-        CASE_OP_32_64(shl):
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                z_mask = arg_info(op->args[1])->z_mask << tmp;
-            }
-            break;
-
-        CASE_OP_32_64(neg):
-            /* Set to 1 all bits to the left of the rightmost.  */
-            z_mask = -(arg_info(op->args[1])->z_mask
-                       & -arg_info(op->args[1])->z_mask);
-            break;
-
-        CASE_OP_32_64(deposit):
-            z_mask = deposit64(arg_info(op->args[1])->z_mask,
-                               op->args[3], op->args[4],
-                               arg_info(op->args[2])->z_mask);
-            break;
-
-        CASE_OP_32_64(extract):
-            z_mask = extract64(arg_info(op->args[1])->z_mask,
-                               op->args[2], op->args[3]);
-            if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-        CASE_OP_32_64(sextract):
-            z_mask = sextract64(arg_info(op->args[1])->z_mask,
-                                op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(xor):
-            z_mask = arg_info(op->args[1])->z_mask
-                   | arg_info(op->args[2])->z_mask;
-            break;
-
-        case INDEX_op_clz_i32:
-        case INDEX_op_ctz_i32:
-            z_mask = arg_info(op->args[2])->z_mask | 31;
-            break;
-
-        case INDEX_op_clz_i64:
-        case INDEX_op_ctz_i64:
-            z_mask = arg_info(op->args[2])->z_mask | 63;
-            break;
-
-        case INDEX_op_ctpop_i32:
-            z_mask = 32 | 31;
-            break;
-        case INDEX_op_ctpop_i64:
-            z_mask = 64 | 63;
-            break;
-
-        CASE_OP_32_64(setcond):
-        case INDEX_op_setcond2_i32:
-            z_mask = 1;
-            break;
-
-        CASE_OP_32_64(movcond):
-            z_mask = arg_info(op->args[3])->z_mask
-                   | arg_info(op->args[4])->z_mask;
-            break;
-
-        CASE_OP_32_64(ld8u):
-            z_mask = 0xff;
-            break;
-        CASE_OP_32_64(ld16u):
-            z_mask = 0xffff;
-            break;
-        case INDEX_op_ld32u_i64:
-            z_mask = 0xffffffffu;
-            break;
-
-        CASE_OP_32_64(qemu_ld):
-            {
-                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
-                MemOp mop = get_memop(oi);
-                if (!(mop & MO_SIGN)) {
-                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
-                }
-            }
-            break;
-
-        CASE_OP_32_64(bswap16):
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffff) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap16(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int16_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(16, 48);
-                break;
-            }
-            break;
-
-        case INDEX_op_bswap32_i64:
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffffffffu) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap32(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int32_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(32, 32);
-                break;
-            }
-            break;
-
-        default:
-            break;
-        }
-
-        /* 32-bit ops generate 32-bit results.  For the result is zero test
-           below, we can ignore high bits, but for further optimizations we
-           need to record that the high bits contain garbage.  */
-        partmask = z_mask;
-        if (ctx.type == TCG_TYPE_I32) {
-            z_mask |= ~(tcg_target_ulong)0xffffffffu;
-            partmask &= 0xffffffffu;
-            affected &= 0xffffffffu;
-        }
-        ctx.z_mask = z_mask;
-
-        if (partmask == 0) {
-            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-            continue;
-        }
-        if (affected == 0) {
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
-        }
+        /* Assume all bits affected, and no bits known zero. */
+        ctx.a_mask = -1;
+        ctx.z_mask = -1;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32u_i64:
+            done = fold_tcg_ld(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
-- 
2.25.1

Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
and muls2_i64.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-        uint32_t a = arg_info(op->args[2])->val;
-        uint32_t b = arg_info(op->args[3])->val;
-        uint64_t r = (uint64_t)a * b;
+        uint64_t a = arg_info(op->args[2])->val;
+        uint64_t b = arg_info(op->args[3])->val;
+        uint64_t h, l;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
+
+        switch (op->opc) {
+        case INDEX_op_mulu2_i32:
+            l = (uint64_t)(uint32_t)a * (uint32_t)b;
+            h = (int32_t)(l >> 32);
+            l = (int32_t)l;
+            break;
+        case INDEX_op_muls2_i32:
+            l = (int64_t)(int32_t)a * (int32_t)b;
+            h = l >> 32;
+            l = (int32_t)l;
+            break;
+        case INDEX_op_mulu2_i64:
+            mulu64(&l, &h, a, b);
+            break;
+        case INDEX_op_muls2_i64:
+            muls64(&l, &h, a, b);
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, l);
+        tcg_opt_gen_movi(ctx, op2, rh, h);
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
-        case INDEX_op_mulu2_i32:
-            done = fold_mulu2_i32(&ctx, op);
+        CASE_OP_32_64(muls2):
+        CASE_OP_32_64(mulu2):
+            done = fold_multiply2(&ctx, op);
             break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
-- 
2.25.1

Rename to fold_addsub2.
Use Int128 to implement the wider operation.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/int128.h"
 #include "tcg/tcg-op.h"
 #include "tcg-internal.h"
 
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-        uint32_t al = arg_info(op->args[2])->val;
-        uint32_t ah = arg_info(op->args[3])->val;
-        uint32_t bl = arg_info(op->args[4])->val;
-        uint32_t bh = arg_info(op->args[5])->val;
-        uint64_t a = ((uint64_t)ah << 32) | al;
-        uint64_t b = ((uint64_t)bh << 32) | bl;
+        uint64_t al = arg_info(op->args[2])->val;
+        uint64_t ah = arg_info(op->args[3])->val;
+        uint64_t bl = arg_info(op->args[4])->val;
+        uint64_t bh = arg_info(op->args[5])->val;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
 
-        if (add) {
-            a += b;
+        if (ctx->type == TCG_TYPE_I32) {
+            uint64_t a = deposit64(al, 32, 32, ah);
+            uint64_t b = deposit64(bl, 32, 32, bh);
+
+            if (add) {
+                a += b;
+            } else {
+                a -= b;
+            }
+
+            al = sextract64(a, 0, 32);
+            ah = sextract64(a, 32, 32);
         } else {
-            a -= b;
+            Int128 a = int128_make128(al, ah);
+            Int128 b = int128_make128(bl, bh);
+
+            if (add) {
+                a = int128_add(a, b);
+            } else {
+                a = int128_sub(a, b);
+            }
+
+            al = int128_getlo(a);
+            ah = int128_gethi(a);
         }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, al);
+        tcg_opt_gen_movi(ctx, op2, rh, ah);
         return true;
     }
     return false;
 }
 
-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, true);
+    return fold_addsub2(ctx, op, true);
 }
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_sub2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, false);
+    return fold_addsub2(ctx, op, false);
 }
 
 static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
-        case INDEX_op_add2_i32:
-            done = fold_add2_i32(&ctx, op);
+        CASE_OP_32_64(add2):
+            done = fold_add2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-        case INDEX_op_sub2_i32:
-            done = fold_sub2_i32(&ctx, op);
+        CASE_OP_32_64(sub2):
+            done = fold_sub2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
-- 
2.25.1

Most of these are handled by creating a fold_const2_commutative
to handle all of the binary operators.  The rest were already
handled on a case-by-case basis in the switch, and have their
own fold function in which to place the call.

We now have only one major switch on TCGOpcode.

Introduce NO_DEST and a block comment for swap_commutative in
order to make the handling of brcond and movcond opcodes cleaner.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
 1 file changed, 70 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     return -1;
 }
 
+/**
+ * swap_commutative:
+ * @dest: TCGArg of the destination argument, or NO_DEST.
+ * @p1: first paired argument
+ * @p2: second paired argument
+ *
+ * If *@p1 is a constant and *@p2 is not, swap.
+ * If *@p2 matches @dest, swap.
+ * Return true if a swap was performed.
+ */
+
+#define NO_DEST  temp_arg(NULL)
+
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
 {
     TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return fold_const2(ctx, op);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 
 static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
+    /* Note that the high and low parts may be independently swapped. */
+    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
+    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
+
     return fold_addsub2(ctx, op, true);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     uint64_t z1, z2;
 
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
+        op->args[2] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
 static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[4];
-    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     TCGArg label = op->args[5];
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[0], &op->args[2])) {
+        op->args[4] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     if (i >= 0) {
         goto do_brcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+    /*
+     * Canonicalize the "false" input reg to match the destination reg so
+     * that the tcg backend can implement a "move if true" operation.
+     */
+    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
+        op->args[5] = cond = tcg_invert_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 
 static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
+    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
+
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
         uint64_t a = arg_info(op->args[2])->val;
         uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
+        op->args[3] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[1], &op->args[3])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
     if (i >= 0) {
         goto do_setcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* For commutative operations make constant second argument */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
-            break;
-        CASE_OP_32_64(brcond):
-            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
-                op->args[2] = tcg_swap_cond(op->args[2]);
-            }
-            break;
-        CASE_OP_32_64(setcond):
-            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
-                op->args[3] = tcg_swap_cond(op->args[3]);
-            }
-            break;
-        CASE_OP_32_64(movcond):
-            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            /* For movcond, we canonicalize the "false" input reg to match
-               the destination reg so that the tcg backend can implement
-               a "move if true" operation.  */
-            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-                op->args[5] = tcg_invert_cond(op->args[5]);
-            }
-            break;
-        CASE_OP_32_64(add2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
-            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
-            break;
-        CASE_OP_32_64(mulu2):
-        CASE_OP_32_64(muls2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
-            break;
-        case INDEX_op_brcond2_i32:
-            if (swap_commutative2(&op->args[0], &op->args[2])) {
-                op->args[4] = tcg_swap_cond(op->args[4]);
-            }
-            break;
-        case INDEX_op_setcond2_i32:
-            if (swap_commutative2(&op->args[1], &op->args[3])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Assume all bits affected, and no bits known zero. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
-- 
2.25.1

This "garbage" setting pre-dates the addition of the type
changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
and INDEX_op_extr{l,h}_i64_i32.

So now we have a definitive points at which to adjust z_mask
to eliminate such bits from the 32-bit operands.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-            /* High bits of a 32-bit quantity are garbage.  */
-            ti->z_mask |= ~0xffffffffull;
-        }
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     TCGTemp *src_ts = arg_temp(src);
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    z_mask = si->z_mask;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-        /* High bits of the destination are now garbage.  */
-        z_mask |= ~0xffffffffull;
-    }
-    di->z_mask = z_mask;
+    di->z_mask = si->z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    /* Convert movi to mov with constant temp. */
-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+    TCGTemp *tv;
 
+    if (ctx->type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    /* Convert movi to mov with constant temp. */
+    tv = tcg_constant_internal(ctx->type, val);
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     uint64_t z_mask = ctx->z_mask;
 
     /*
-     * 32-bit ops generate 32-bit results.  For the result is zero test
-     * below, we can ignore high bits, but for further optimizations we
-     * need to record that the high bits contain garbage.
+     * 32-bit ops generate 32-bit results, which for the purpose of
+     * simplifying tcg are sign-extended.  Certainly that's how we
+     * represent our constants elsewhere.  Note that the bits will
+     * be reset properly for a 64-bit value when encountering the
+     * type changing opcodes.
      */
     if (ctx->type == TCG_TYPE_I32) {
-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
-        a_mask &= MAKE_64BIT_MASK(0, 32);
-        z_mask &= MAKE_64BIT_MASK(0, 32);
+        a_mask = (int32_t)a_mask;
+        z_mask = (int32_t)z_mask;
+        ctx->z_mask = z_mask;
     }
 
     if (z_mask == 0) {
-- 
2.25.1

Certain targets, like riscv, produce signed 32-bit results.
This can lead to lots of redundant extensions as values are
manipulated.

Begin by tracking only the obvious sign-extensions, and
converting them to simple copies when possible.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 102 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *next_copy;
     uint64_t val;
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
 } TempOptInfo;
 
 typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     /* In flight values from optimization. */
     uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
+    uint64_t s_mask;  /* mask of clrsb(value) bits */
     TCGType type;
 } OptContext;
 
+/* Calculate the smask for a specific value. */
+static uint64_t smask_from_value(uint64_t value)
+{
+    int rep = clrsb64(value);
+    return ~(~0ull >> rep);
+}
+
+/*
+ * Calculate the smask for a given set of known-zeros.
+ * If there are lots of zeros on the left, we can consider the remainder
+ * an unsigned field, and thus the corresponding signed field is one bit
+ * larger.
+ */
+static uint64_t smask_from_zmask(uint64_t zmask)
+{
+    /*
+     * Only the 0 bits are significant for zmask, thus the msb itself
+     * must be zero, else we have no sign information.
+     */
+    int rep = clz64(zmask);
+    if (rep == 0) {
+        return 0;
+    }
+    rep -= 1;
+    return ~(~0ull >> rep);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->prev_copy = ts;
     ti->is_const = false;
     ti->z_mask = -1;
+    ti->s_mask = 0;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
+        ti->s_mask = smask_from_value(ts->val);
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
+        ti->s_mask = 0;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[1] = src;
 
     di->z_mask = si->z_mask;
+    di->s_mask = si->s_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
 
     nb_oargs = def->nb_oargs;
     for (i = 0; i < nb_oargs; i++) {
-        reset_temp(op->args[i]);
+        TCGTemp *ts = arg_temp(op->args[i]);
+        reset_ts(ts);
         /*
-         * Save the corresponding known-zero bits mask for the
+         * Save the corresponding known-zero/sign bits mask for the
          * first output argument (only one supported so far).
          */
         if (i == 0) {
-            arg_info(op->args[i])->z_mask = ctx->z_mask;
+            ts_info(ts)->z_mask = ctx->z_mask;
+            ts_info(ts)->s_mask = ctx->s_mask;
         }
     }
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
     uint64_t z_mask = ctx->z_mask;
+    uint64_t s_mask = ctx->s_mask;
 
     /*
      * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (ctx->type == TCG_TYPE_I32) {
         a_mask = (int32_t)a_mask;
         z_mask = (int32_t)z_mask;
+        s_mask |= MAKE_64BIT_MASK(32, 32);
         ctx->z_mask = z_mask;
+        ctx->s_mask = s_mask;
     }
 
     if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask, sign;
+    uint64_t z_mask, s_mask, sign;
 
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     }
 
     z_mask = arg_info(op->args[1])->z_mask;
+
     switch (op->opc) {
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
+    s_mask = smask_from_zmask(z_mask);
 
     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
     case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
         /* If the sign bit may be 1, force all the bits above to 1. */
         if (z_mask & sign) {
             z_mask |= sign;
+            s_mask = sign << 1;
         }
         break;
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
+        s_mask = 0;
         break;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask_old, z_mask;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = extract64(t, op->args[2], op->args[3]);
+        t = extract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
     z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0) {
+    z_mask = extract64(z_mask_old, pos, len);
+    if (pos == 0) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask_old, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask, sign;
     bool type_change = false;
 
     if (fold_const1(ctx, op)) {
         return true;
     }
 
-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+    s_mask = arg_info(op->args[1])->s_mask;
+    s_mask_old = s_mask;
 
     switch (op->opc) {
     CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     if (z_mask & sign) {
         z_mask |= sign;
-    } else if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
     }
+    s_mask |= sign << 1;
+
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
+    if (!type_change) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
     if (!type_change) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
 
-    if (!(mop & MO_SIGN) && width < 64) {
-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    if (width < 64) {
+        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        if (!(mop & MO_SIGN)) {
+            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            ctx->s_mask <<= 1;
+        }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
-    int64_t z_mask_old, z_mask;
+    uint64_t z_mask, s_mask, s_mask_old;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = sextract64(t, op->args[2], op->args[3]);
+        t = sextract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
-    z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0 && z_mask >= 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
-    }
+    z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask, pos, len);
     ctx->z_mask = z_mask;
 
+    s_mask_old = arg_info(op->args[1])->s_mask;
+    s_mask = sextract64(s_mask_old, pos, len);
+    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
+    ctx->s_mask = s_mask;
+
+    if (pos == 0) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
+
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 {
     /* We can't do any folding with a load, but we can record bits. */
     switch (op->opc) {
+    CASE_OP_32_64(ld8s):
+        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        break;
     CASE_OP_32_64(ld8u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
+        break;
+    CASE_OP_32_64(ld16s):
+        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
         break;
     CASE_OP_32_64(ld16u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
+        break;
+    case INDEX_op_ld32s_i64:
+        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
         break;
     case INDEX_op_ld32u_i64:
         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
         break;
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* Assume all bits affected, and no bits known zero. */
+        /* Assume all bits affected, no bits known zero, no sign reps. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
+        ctx.s_mask = 0;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8s):
         CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16s):
         CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32s_i64:
         case INDEX_op_ld32u_i64:
             done = fold_tcg_ld(&ctx, op);
             break;
-- 
2.25.1

Sign repetitions are perforce all identical, whether they are 1 or 0.
Bitwise operations preserve the relative quantity of the repetitions.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
     z2 = arg_info(op->args[2])->z_mask;
     ctx->z_mask = z1 & z2;
 
+    /*
+     * Sign repetitions are perforce all identical, whether they are 1 or 0.
+     * Bitwise operations preserve the relative quantity of the repetitions.
+     */
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
+
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     }
     ctx->z_mask = z1;
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[3])->z_mask
                 | arg_info(op->args[4])->z_mask;
+    ctx->s_mask = arg_info(op->args[3])->s_mask
+                & arg_info(op->args[4])->s_mask;
 
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
         return true;
     }
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask;
+
     /* Because of fold_to_not, we want to always return true, via finish. */
     finish_folding(ctx, op);
     return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
-- 
2.25.1

For constant shifts, we can simply shift the s_mask.

For variable shifts, we know that sar does not reduce
the s_mask, which helps for sequences like

ext32s_i64  t, in
    sar_i64     t, t, v
    ext32s_i64  out, t

allowing the final extend to be eliminated.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
     return ~(~0ull >> rep);
 }
 
+/*
+ * Recreate a properly left-aligned smask after manipulation.
+ * Some bit-shuffling, particularly shifts and rotates, may
+ * retain sign bits on the left, but may scatter disconnected
+ * sign bits on the right.  Retain only what remains to the left.
+ */
+static uint64_t smask_from_smask(int64_t smask)
+{
+    /* Only the 1 bits are significant for smask */
+    return smask_from_zmask(~smask);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask, z_mask, sign;
+
     if (fold_const2(ctx, op) ||
         fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
 
+    s_mask = arg_info(op->args[1])->s_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+
     if (arg_is_const(op->args[2])) {
-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-                                          arg_info(op->args[1])->z_mask,
-                                          arg_info(op->args[2])->val);
+        int sh = arg_info(op->args[2])->val;
+
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
+
+        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
+        ctx->s_mask = smask_from_smask(s_mask);
+
         return fold_masks(ctx, op);
     }
+
+    switch (op->opc) {
+    CASE_OP_32_64(sar):
+        /*
+         * Arithmetic right shift will not reduce the number of
+         * input sign repetitions.
+         */
+        ctx->s_mask = s_mask;
+        break;
+    CASE_OP_32_64(shr):
+        /*
+         * If the sign bit is known zero, then logical right shift
+         * will not reduced the number of input sign repetitions.
+         */
+        sign = (s_mask & -s_mask) >> 1;
+        if (!(z_mask & sign)) {
+            ctx->s_mask = s_mask;
+        }
+        break;
+    default:
+        break;
+    }
+
     return false;
 }
 
-- 
2.25.1