Series comparison

-[PULL 00/16] tcg patch queue
+[PULL 00/56] tcg patch queue
-This is v4 of my notdirty + rom patch set with two suggested name
+The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:
 changes (qemu_build_not_reached, TLB_DISCARD_WRITE) from David and Alex.
+  Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)
 r~
 The following changes since commit 240ab11fb72049d6373cbbec8d788f8e411a00bc:
   Merge remote-tracking branch 'remotes/aperard/tags/pull-xen-20190924' into staging (2019-09-24 15:36:31 +0100)
 are available in the Git repository at:
-  https://github.com/rth7680/qemu.git tags/pull-tcg-20190925
+  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027
-for you to fetch changes up to ae57db63acf5a0399232f852acc5c1d83ef63400:
+for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:
-  cputlb: Pass retaddr to tb_check_watchpoint (2019-09-25 10:56:28 -0700)
+  tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)
 ----------------------------------------------------------------
-Fixes for TLB_BSWAP
+Improvements to qemu/int128
-Conversion of NOTDIRTY and ROM handling to cputlb
+Fixes for 128/64 division.
-Followup cleanups to cputlb
+Cleanup tcg/optimize.c
 Optimize redundant sign extensions
 ----------------------------------------------------------------
-Richard Henderson (16):
+Frédéric Pétrot (1):
-      exec: Use TARGET_PAGE_BITS_MIN for TLB flags
+      qemu/int128: Add int128_{not,xor}
       cputlb: Disable __always_inline__ without optimization
       qemu/compiler.h: Add qemu_build_not_reached
       cputlb: Use qemu_build_not_reached in load/store_helpers
       cputlb: Split out load/store_memop
       cputlb: Introduce TLB_BSWAP
       exec: Adjust notdirty tracing
       cputlb: Move ROM handling from I/O path to TLB path
       cputlb: Move NOTDIRTY handling from I/O path to TLB path
       cputlb: Partially inline memory_region_section_get_iotlb
       cputlb: Merge and move memory_notdirty_write_{prepare,complete}
       cputlb: Handle TLB_NOTDIRTY in probe_access
       cputlb: Remove cpu->mem_io_vaddr
       cputlb: Remove tb_invalidate_phys_page_range is_cpu_write_access
       cputlb: Pass retaddr to tb_invalidate_phys_page_fast
       cputlb: Pass retaddr to tb_check_watchpoint
- accel/tcg/translate-all.h      |   8 +-
+Luis Pires (4):
- include/exec/cpu-all.h         |  23 ++-
+      host-utils: move checks out of divu128/divs128
- include/exec/cpu-common.h      |   3 -
+      host-utils: move udiv_qrnnd() to host-utils
- include/exec/exec-all.h        |   6 +-
+      host-utils: add 128-bit quotient support to divu128/divs128
- include/exec/memory-internal.h |  65 --------
+      host-utils: add unit tests for divu128/divs128
  include/hw/core/cpu.h          |   2 -
  include/qemu/compiler.h        |  26 +++
  accel/tcg/cputlb.c             | 348 +++++++++++++++++++++++++----------------
  accel/tcg/translate-all.c      |  51 +++---
  exec.c                         | 158 +------------------
  hw/core/cpu.c                  |   1 -
  memory.c                       |  20 ---
  trace-events                   |   4 +-
 files changed, 288 insertions(+), 427 deletions(-)
+Richard Henderson (51):
+      tcg/optimize: Rename "mask" to "z_mask"
+      tcg/optimize: Split out OptContext
+      tcg/optimize: Remove do_default label
+      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
+      tcg/optimize: Move prev_mb into OptContext
+      tcg/optimize: Split out init_arguments
+      tcg/optimize: Split out copy_propagate
+      tcg/optimize: Split out fold_call
+      tcg/optimize: Drop nb_oargs, nb_iargs locals
+      tcg/optimize: Change fail return for do_constant_folding_cond*
+      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
+      tcg/optimize: Split out finish_folding
+      tcg/optimize: Use a boolean to avoid a mass of continues
+      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
+      tcg/optimize: Split out fold_const{1,2}
+      tcg/optimize: Split out fold_setcond2
+      tcg/optimize: Split out fold_brcond2
+      tcg/optimize: Split out fold_brcond
+      tcg/optimize: Split out fold_setcond
+      tcg/optimize: Split out fold_mulu2_i32
+      tcg/optimize: Split out fold_addsub2_i32
+      tcg/optimize: Split out fold_movcond
+      tcg/optimize: Split out fold_extract2
+      tcg/optimize: Split out fold_extract, fold_sextract
+      tcg/optimize: Split out fold_deposit
+      tcg/optimize: Split out fold_count_zeros
+      tcg/optimize: Split out fold_bswap
+      tcg/optimize: Split out fold_dup, fold_dup2
+      tcg/optimize: Split out fold_mov
+      tcg/optimize: Split out fold_xx_to_i
+      tcg/optimize: Split out fold_xx_to_x
+      tcg/optimize: Split out fold_xi_to_i
+      tcg/optimize: Add type to OptContext
+      tcg/optimize: Split out fold_to_not
+      tcg/optimize: Split out fold_sub_to_neg
+      tcg/optimize: Split out fold_xi_to_x
+      tcg/optimize: Split out fold_ix_to_i
+      tcg/optimize: Split out fold_masks
+      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
+      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
+      tcg/optimize: Sink commutative operand swapping into fold functions
+      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
+      tcg/optimize: Use fold_xx_to_i for orc
+      tcg/optimize: Use fold_xi_to_x for mul
+      tcg/optimize: Use fold_xi_to_x for div
+      tcg/optimize: Use fold_xx_to_i for rem
+      tcg/optimize: Optimize sign extensions
+      tcg/optimize: Propagate sign info for logical operations
+      tcg/optimize: Propagate sign info for setcond
+      tcg/optimize: Propagate sign info for bit counting
+      tcg/optimize: Propagate sign info for shifting
+ include/fpu/softfloat-macros.h |   82 --
+ include/hw/clock.h             |    5 +-
+ include/qemu/host-utils.h      |  121 +-
+ include/qemu/int128.h          |   20 +
+ target/ppc/int_helper.c        |   23 +-
+ tcg/optimize.c                 | 2644 ++++++++++++++++++++++++----------------
+ tests/unit/test-div128.c       |  197 +++
+ util/host-utils.c              |  147 ++-
+ tests/unit/meson.build         |    1 +
+files changed, 2053 insertions(+), 1187 deletions(-)
+ create mode 100644 tests/unit/test-div128.c

-[PULL 05/16] cputlb: Split out load/store_memop
+[PULL 01/56] qemu/int128: Add int128_{not,xor}
-We will shortly be using these more than once.
+From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Addition of not and xor on 128-bit integers.
-Reviewed-by: David Hildenbrand <david@redhat.com>
 Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
 Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
 Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
 [rth: Split out logical operations.]
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/cputlb.c | 107 +++++++++++++++++++++++----------------------
+ include/qemu/int128.h | 20 ++++++++++++++++++++
-file changed, 55 insertions(+), 52 deletions(-)
+file changed, 20 insertions(+)
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
+diff --git a/include/qemu/int128.h b/include/qemu/int128.h
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/cputlb.c
+--- a/include/qemu/int128.h
-+++ b/accel/tcg/cputlb.c
++++ b/include/qemu/int128.h
-@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
- typedef uint64_t FullLoadHelper(CPUArchState *env, target_ulong addr,
+     return a;
-                                 TCGMemOpIdx oi, uintptr_t retaddr);
+ }
-+static inline uint64_t QEMU_ALWAYS_INLINE
++static inline Int128 int128_not(Int128 a)
 +load_memop(const void *haddr, MemOp op)
 +{
-+    switch (op) {
++    return ~a;
 +    case MO_UB:
 +        return ldub_p(haddr);
 +    case MO_BEUW:
 +        return lduw_be_p(haddr);
 +    case MO_LEUW:
 +        return lduw_le_p(haddr);
 +    case MO_BEUL:
 +        return (uint32_t)ldl_be_p(haddr);
 +    case MO_LEUL:
 +        return (uint32_t)ldl_le_p(haddr);
 +    case MO_BEQ:
 +        return ldq_be_p(haddr);
 +    case MO_LEQ:
 +        return ldq_le_p(haddr);
 +    default:
 +        qemu_build_not_reached();
 +    }
 +}
 +
- static inline uint64_t QEMU_ALWAYS_INLINE
+ static inline Int128 int128_and(Int128 a, Int128 b)
- load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
+ {
-             uintptr_t retaddr, MemOp op, bool code_read,
+     return a & b;
-@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
+     return a | b;
   do_aligned_access:
      haddr = (void *)((uintptr_t)addr + entry->addend);
 -    switch (op) {
 -    case MO_UB:
 -        res = ldub_p(haddr);
 -        break;
 -    case MO_BEUW:
 -        res = lduw_be_p(haddr);
 -        break;
 -    case MO_LEUW:
 -        res = lduw_le_p(haddr);
 -        break;
 -    case MO_BEUL:
 -        res = (uint32_t)ldl_be_p(haddr);
 -        break;
 -    case MO_LEUL:
 -        res = (uint32_t)ldl_le_p(haddr);
 -        break;
 -    case MO_BEQ:
 -        res = ldq_be_p(haddr);
 -        break;
 -    case MO_LEQ:
 -        res = ldq_le_p(haddr);
 -        break;
 -    default:
 -        qemu_build_not_reached();
 -    }
 -
 -    return res;
 +    return load_memop(haddr, op);
  }
- /*
++static inline Int128 int128_xor(Int128 a, Int128 b)
@@ -XXX,XX +XXX,XX @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
   * Store Helpers
   */
 +static inline void QEMU_ALWAYS_INLINE
 +store_memop(void *haddr, uint64_t val, MemOp op)
 +{
-+    switch (op) {
++    return a ^ b;
 +    case MO_UB:
 +        stb_p(haddr, val);
 +        break;
 +    case MO_BEUW:
 +        stw_be_p(haddr, val);
 +        break;
 +    case MO_LEUW:
 +        stw_le_p(haddr, val);
 +        break;
 +    case MO_BEUL:
 +        stl_be_p(haddr, val);
 +        break;
 +    case MO_LEUL:
 +        stl_le_p(haddr, val);
 +        break;
 +    case MO_BEQ:
 +        stq_be_p(haddr, val);
 +        break;
 +    case MO_LEQ:
 +        stq_le_p(haddr, val);
 +        break;
 +    default:
 +        qemu_build_not_reached();
 +    }
 +}
 +
- static inline void QEMU_ALWAYS_INLINE
+ static inline Int128 int128_rshift(Int128 a, int n)
- store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+ {
-              TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
+     return a >> n;
-@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
+     return int128_make128(a, (a < 0) ? -1 : 0);
   do_aligned_access:
      haddr = (void *)((uintptr_t)addr + entry->addend);
 -    switch (op) {
 -    case MO_UB:
 -        stb_p(haddr, val);
 -        break;
 -    case MO_BEUW:
 -        stw_be_p(haddr, val);
 -        break;
 -    case MO_LEUW:
 -        stw_le_p(haddr, val);
 -        break;
 -    case MO_BEUL:
 -        stl_be_p(haddr, val);
 -        break;
 -    case MO_LEUL:
 -        stl_le_p(haddr, val);
 -        break;
 -    case MO_BEQ:
 -        stq_be_p(haddr, val);
 -        break;
 -    case MO_LEQ:
 -        stq_le_p(haddr, val);
 -        break;
 -    default:
 -        qemu_build_not_reached();
 -    }
 +    store_memop(haddr, val, op);
  }
- void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
++static inline Int128 int128_not(Int128 a)
 +{
 +    return int128_make128(~a.lo, ~a.hi);
 +}
 +
  static inline Int128 int128_and(Int128 a, Int128 b)
  {
      return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
      return int128_make128(a.lo | b.lo, a.hi | b.hi);
  }
 +static inline Int128 int128_xor(Int128 a, Int128 b)
 +{
 +    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
 +}
 +
  static inline Int128 int128_rshift(Int128 a, int n)
  {
      int64_t h;
 --
-.17.1
+.25.1

-New patch
+[PULL 02/56] host-utils: move checks out of divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 In preparation for changing the divu128/divs128 implementations
 to allow for quotients larger than 64 bits, move the div-by-zero
 and overflow checks to the callers.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |  5 +++--
  include/qemu/host-utils.h | 34 ++++++++++++---------------------
  target/ppc/int_helper.c   | 14 +++++++++-----
  util/host-utils.c         | 40 ++++++++++++++++++---------------------
 files changed, 42 insertions(+), 51 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
          return 0;
      }
      /*
 -     * Ignore divu128() return value as we've caught div-by-zero and don't
 -     * need different behaviour for overflow.
 +     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 +     * divu128 does not return a valid truncated quotient, so the result will
 +     * be wrong.
       */
      divu128(&lo, &hi, clk->period);
      return lo;
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 -        __uint128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result > UINT64_MAX;
 -    }
 +    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
 +    __uint128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
 -static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    if (divisor == 0) {
 -        return 1;
 -    } else {
 -        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 -        __int128_t result = dividend / divisor;
 -        *plow = result;
 -        *phigh = dividend % divisor;
 -        return result != *plow;
 -    }
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t result = dividend / divisor;
 +    *plow = result;
 +    *phigh = dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
      uint64_t rt = 0;
      int overflow = 0;
 -    overflow = divu128(&rt, &ra, rb);
 -
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || ra >= rb)) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divu128(&rt, &ra, rb);
      }
      if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
      int64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
 -    int overflow = divs128(&rt, &ra, rb);
 +    int overflow = 0;
 -    if (unlikely(overflow)) {
 +    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
 +        overflow = 1;
          rt = 0; /* Undefined */
 +    } else {
 +        divs128(&rt, &ra, rb);
      }
      if (oe) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
      *phigh = rh;
  }
 -/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
 -/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
 -/* remainder via phigh. */
 -int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +/*
 + * Unsigned 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
      unsigned i;
      uint64_t carry = 0;
 -    if (divisor == 0) {
 -        return 1;
 -    } else if (dhi == 0) {
 +    if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
          *phigh = dlo % divisor;
 -        return 0;
 -    } else if (dhi >= divisor) {
 -        return 1;
      } else {
          for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
          *plow = dlo;
          *phigh = dhi;
 -        return 0;
      }
  }
 -int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +/*
 + * Signed 128-by-64 division. Returns quotient via plow and
 + * remainder via phigh.
 + * The result must fit in 64 bits (plow) - otherwise, the result
 + * is undefined.
 + * This function will cause a division by zero if passed a zero divisor.
 + */
 +void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
  {
      int sgn_dvdnd = *phigh < 0;
      int sgn_divsr = divisor < 0;
 -    int overflow = 0;
      if (sgn_dvdnd) {
          *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
          divisor = 0 - divisor;
      }
 -    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 +    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
      if (sgn_dvdnd  ^ sgn_divsr) {
          *plow = 0 - *plow;
      }
 -
 -    if (!overflow) {
 -        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
 -            overflow = 1;
 -        }
 -    }
 -
 -    return overflow;
  }
  #endif
 --
 .25.1

-New patch
+[PULL 03/56] host-utils: move udiv_qrnnd() to host-utils
+From: Luis Pires <luis.pires@eldorado.org.br>
 Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
 so it can be reused by divu128().
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/fpu/softfloat-macros.h | 82 ----------------------------------
  include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 files changed, 81 insertions(+), 82 deletions(-)
 diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/fpu/softfloat-macros.h
 +++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
   * so some portions are provided under:
   *  the SoftFloat-2a license
   *  the BSD license
 - *  GPL-v2-or-later
   *
   * Any future contributions to this file after December 1st 2014 will be
   * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
   * THE POSSIBILITY OF SUCH DAMAGE.
   */
 -/* Portions of this work are licensed under the terms of the GNU GPL,
 - * version 2 or later. See the COPYING file in the top-level directory.
 - */
 -
  #ifndef FPU_SOFTFLOAT_MACROS_H
  #define FPU_SOFTFLOAT_MACROS_H
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
  }
 -/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 - * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 - *
 - * Licensed under the GPLv2/LGPLv3
 - */
 -static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 -                                  uint64_t n0, uint64_t d)
 -{
 -#if defined(__x86_64__)
 -    uint64_t q;
 -    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 -    return q;
 -#elif defined(__s390x__) && !defined(__clang__)
 -    /* Need to use a TImode type to get an even register pair for DLGR.  */
 -    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 -    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 -    *r = n >> 64;
 -    return n;
 -#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 -    /* From Power ISA 2.06, programming note for divdeu.  */
 -    uint64_t q1, q2, Q, r1, r2, R;
 -    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 -        : "=&r"(q1), "=r"(q2)
 -        : "r"(n1), "r"(n0), "r"(d));
 -    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 -    r2 = n0 - (q2 * d);
 -    Q = q1 + q2;
 -    R = r1 + r2;
 -    if (R >= d || R < r2) { /* overflow implies R > d */
 -        Q += 1;
 -        R -= d;
 -    }
 -    *r = R;
 -    return Q;
 -#else
 -    uint64_t d0, d1, q0, q1, r1, r0, m;
 -
 -    d0 = (uint32_t)d;
 -    d1 = d >> 32;
 -
 -    r1 = n1 % d1;
 -    q1 = n1 / d1;
 -    m = q1 * d0;
 -    r1 = (r1 << 32) | (n0 >> 32);
 -    if (r1 < m) {
 -        q1 -= 1;
 -        r1 += d;
 -        if (r1 >= d) {
 -            if (r1 < m) {
 -                q1 -= 1;
 -                r1 += d;
 -            }
 -        }
 -    }
 -    r1 -= m;
 -
 -    r0 = r1 % d1;
 -    q0 = r1 / d1;
 -    m = q0 * d0;
 -    r0 = (r0 << 32) | (uint32_t)n0;
 -    if (r0 < m) {
 -        q0 -= 1;
 -        r0 += d;
 -        if (r0 >= d) {
 -            if (r0 < m) {
 -                q0 -= 1;
 -                r0 += d;
 -            }
 -        }
 -    }
 -    r0 -= m;
 -
 -    *r = r0;
 -    return (q1 << 32) | q0;
 -#endif
 -}
 -
  /*----------------------------------------------------------------------------
  | Returns an approximation to the square root of the 32-bit significand given
  | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
   * THE SOFTWARE.
   */
 +/* Portions of this work are licensed under the terms of the GNU GPL,
 + * version 2 or later. See the COPYING file in the top-level directory.
 + */
 +
  #ifndef HOST_UTILS_H
  #define HOST_UTILS_H
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
   */
  void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 +/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 + * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 + *
 + * Licensed under the GPLv2/LGPLv3
 + */
 +static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 +                                  uint64_t n0, uint64_t d)
 +{
 +#if defined(__x86_64__)
 +    uint64_t q;
 +    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 +    return q;
 +#elif defined(__s390x__) && !defined(__clang__)
 +    /* Need to use a TImode type to get an even register pair for DLGR.  */
 +    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 +    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 +    *r = n >> 64;
 +    return n;
 +#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 +    /* From Power ISA 2.06, programming note for divdeu.  */
 +    uint64_t q1, q2, Q, r1, r2, R;
 +    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 +        : "=&r"(q1), "=r"(q2)
 +        : "r"(n1), "r"(n0), "r"(d));
 +    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 +    r2 = n0 - (q2 * d);
 +    Q = q1 + q2;
 +    R = r1 + r2;
 +    if (R >= d || R < r2) { /* overflow implies R > d */
 +        Q += 1;
 +        R -= d;
 +    }
 +    *r = R;
 +    return Q;
 +#else
 +    uint64_t d0, d1, q0, q1, r1, r0, m;
 +
 +    d0 = (uint32_t)d;
 +    d1 = d >> 32;
 +
 +    r1 = n1 % d1;
 +    q1 = n1 / d1;
 +    m = q1 * d0;
 +    r1 = (r1 << 32) | (n0 >> 32);
 +    if (r1 < m) {
 +        q1 -= 1;
 +        r1 += d;
 +        if (r1 >= d) {
 +            if (r1 < m) {
 +                q1 -= 1;
 +                r1 += d;
 +            }
 +        }
 +    }
 +    r1 -= m;
 +
 +    r0 = r1 % d1;
 +    q0 = r1 / d1;
 +    m = q0 * d0;
 +    r0 = (r0 << 32) | (uint32_t)n0;
 +    if (r0 < m) {
 +        q0 -= 1;
 +        r0 += d;
 +        if (r0 >= d) {
 +            if (r0 < m) {
 +                q0 -= 1;
 +                r0 += d;
 +            }
 +        }
 +    }
 +    r0 -= m;
 +
 +    *r = r0;
 +    return (q1 << 32) | q0;
 +#endif
 +}
 +
  #endif
 --
 .25.1

-New patch
+[PULL 04/56] host-utils: add 128-bit quotient support to divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 These will be used to implement new decimal floating point
 instructions from Power ISA 3.1.
 The remainder is now returned directly by divu128/divs128,
 freeing up phigh to receive the high 64 bits of the quotient.
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  include/hw/clock.h        |   6 +-
  include/qemu/host-utils.h |  20 ++++--
  target/ppc/int_helper.c   |   9 +--
  util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 files changed, 108 insertions(+), 60 deletions(-)
 diff --git a/include/hw/clock.h b/include/hw/clock.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/hw/clock.h
 +++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
      if (clk->period == 0) {
          return 0;
      }
 -    /*
 -     * BUG: when CONFIG_INT128 is not defined, the current implementation of
 -     * divu128 does not return a valid truncated quotient, so the result will
 -     * be wrong.
 -     */
 +
      divu128(&lo, &hi, clk->period);
      return lo;
  }
 diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/qemu/host-utils.h
 +++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
      return (__int128_t)a * b / c;
  }
 -static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
 +                               uint64_t divisor)
  {
      __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
      __uint128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
 -static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
 +                              int64_t divisor)
  {
 -    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
 +    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
      __int128_t result = dividend / divisor;
 +
      *plow = result;
 -    *phigh = dividend % divisor;
 +    *phigh = result >> 64;
 +    return dividend % divisor;
  }
  #else
  void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
  void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
  static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
  {
 diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
 index XXXXXXX..XXXXXXX 100644
 --- a/target/ppc/int_helper.c
 +++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
  uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
  {
 -    int64_t rt = 0;
 +    uint64_t rt = 0;
      int64_t ra = (int64_t)rau;
      int64_t rb = (int64_t)rbu;
      int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
      int cr;
      uint64_t lo_value;
      uint64_t hi_value;
 +    uint64_t rem;
      ppc_avr_t ret = { .u64 = { 0, 0 } };
      if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
           * In that case, we leave r unchanged.
           */
      } else {
 -        divu128(&lo_value, &hi_value, 1000000000000000ULL);
 +        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 -        for (i = 1; i < 16; hi_value /= 10, i++) {
 -            bcd_put_digit(&ret, hi_value % 10, i);
 +        for (i = 1; i < 16; rem /= 10, i++) {
 +            bcd_put_digit(&ret, rem % 10, i);
          }
          for (; i < 32; lo_value /= 10, i++) {
 diff --git a/util/host-utils.c b/util/host-utils.c
 index XXXXXXX..XXXXXXX 100644
 --- a/util/host-utils.c
 +++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
  }
  /*
 - * Unsigned 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Unsigned 128-by-64 division.
 + * Returns the quotient via plow (low 64 bits) and phigh (high 64 bits).
 + * Returns the remainder via the function return value.
   */
 -void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 +uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
  {
      uint64_t dhi = *phigh;
      uint64_t dlo = *plow;
 -    unsigned i;
 -    uint64_t carry = 0;
 +    uint64_t rem, dhighest;
 +    int sh;
      if (divisor == 0 || dhi == 0) {
          *plow  = dlo / divisor;
 -        *phigh = dlo % divisor;
 +        *phigh = 0;
 +        return dlo % divisor;
      } else {
 +        sh = clz64(divisor);
 -        for (i = 0; i < 64; i++) {
 -            carry = dhi >> 63;
 -            dhi = (dhi << 1) | (dlo >> 63);
 -            if (carry || (dhi >= divisor)) {
 -                dhi -= divisor;
 -                carry = 1;
 -            } else {
 -                carry = 0;
 +        if (dhi < divisor) {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
              }
 -            dlo = (dlo << 1) | carry;
 +
 +            *phigh = 0;
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
 +        } else {
 +            if (sh != 0) {
 +                /* normalize the divisor, shifting the dividend accordingly */
 +                divisor <<= sh;
 +                dhighest = dhi >> (64 - sh);
 +                dhi = (dhi << sh) | (dlo >> (64 - sh));
 +                dlo <<= sh;
 +
 +                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
 +            } else {
 +                /**
 +                 * dhi >= divisor
 +                 * Since the MSB of divisor is set (sh == 0),
 +                 * (dhi - divisor) < divisor
 +                 *
 +                 * Thus, the high part of the quotient is 1, and we can
 +                 * calculate the low part with a single call to udiv_qrnnd
 +                 * after subtracting divisor from dhi
 +                 */
 +                dhi -= divisor;
 +                *phigh = 1;
 +            }
 +
 +            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
          }
 -        *plow = dlo;
 -        *phigh = dhi;
 +        /*
 +         * since the dividend/divisor might have been normalized,
 +         * the remainder might also have to be shifted back
 +         */
 +        return rem >> sh;
      }
  }
  /*
 - * Signed 128-by-64 division. Returns quotient via plow and
 - * remainder via phigh.
 - * The result must fit in 64 bits (plow) - otherwise, the result
 - * is undefined.
 - * This function will cause a division by zero if passed a zero divisor.
 + * Signed 128-by-64 division.
 + * Returns quotient via plow and phigh.
 + * Also returns the remainder via the function return value.
   */
 -void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 +int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
  {
 -    int sgn_dvdnd = *phigh < 0;
 -    int sgn_divsr = divisor < 0;
 +    bool neg_quotient = false, neg_remainder = false;
 +    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
 +    uint64_t rem;
 -    if (sgn_dvdnd) {
 -        *plow = ~(*plow);
 -        *phigh = ~(*phigh);
 -        if (*plow == (int64_t)-1) {
 +    if (*phigh < 0) {
 +        neg_quotient = !neg_quotient;
 +        neg_remainder = !neg_remainder;
 +
 +        if (unsig_lo == 0) {
 +            unsig_hi = -unsig_hi;
 +        } else {
 +            unsig_hi = ~unsig_hi;
 +            unsig_lo = -unsig_lo;
 +        }
 +    }
 +
 +    if (divisor < 0) {
 +        neg_quotient = !neg_quotient;
 +
 +        divisor = -divisor;
 +    }
 +
 +    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
 +
 +    if (neg_quotient) {
 +        if (unsig_lo == 0) {
 +            *phigh = -unsig_hi;
              *plow = 0;
 -            (*phigh)++;
 -         } else {
 -            (*plow)++;
 -         }
 +        } else {
 +            *phigh = ~unsig_hi;
 +            *plow = -unsig_lo;
 +        }
 +    } else {
 +        *phigh = unsig_hi;
 +        *plow = unsig_lo;
      }
 -    if (sgn_divsr) {
 -        divisor = 0 - divisor;
 -    }
 -
 -    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 -
 -    if (sgn_dvdnd  ^ sgn_divsr) {
 -        *plow = 0 - *plow;
 +    if (neg_remainder) {
 +        return -rem;
 +    } else {
 +        return rem;
      }
  }
  #endif
 --
 .25.1

-New patch
+[PULL 05/56] host-utils: add unit tests for divu128/divs128
+From: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
 Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
  tests/unit/meson.build   |   1 +
 files changed, 198 insertions(+)
  create mode 100644 tests/unit/test-div128.c
 diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
 new file mode 100644
 index XXXXXXX..XXXXXXX
 --- /dev/null
 +++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
 +/*
 + * Test 128-bit division functions
 + *
 + * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public
 + * License as published by the Free Software Foundation; either
 + * version 2.1 of the License, or (at your option) any later version.
 + *
 + * This library is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 + */
 +
 +#include "qemu/osdep.h"
 +#include "qemu/host-utils.h"
 +
 +typedef struct {
 +    uint64_t high;
 +    uint64_t low;
 +    uint64_t rhigh;
 +    uint64_t rlow;
 +    uint64_t divisor;
 +    uint64_t remainder;
 +} test_data_unsigned;
 +
 +typedef struct {
 +    int64_t high;
 +    uint64_t low;
 +    int64_t rhigh;
 +    uint64_t rlow;
 +    int64_t divisor;
 +    int64_t remainder;
 +} test_data_signed;
 +
 +static const test_data_unsigned test_table_unsigned[] = {
 +    /* Dividend fits in 64 bits */
 +    { 0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x0000000000000003ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x0000000000000002ULL, 0x0000000000000001ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x0000000000000000ULL, 0xa000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000002ULL,
 +      0x4000000000000000ULL, 0x2000000000000000ULL},
 +    { 0x0000000000000000ULL, 0x8000000000000000ULL,
 +      0x0000000000000000ULL, 0x0000000000000001ULL,
 +      0x8000000000000000ULL, 0x0000000000000000ULL},
 +
 +    /* Dividend > 64 bits, with MSB 0 */
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0000000000000001ULL, 0x000000000000000dULL,
 +      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
 +    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
 +      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
 +      0x0000000000000010ULL, 0x0000000000000001ULL},
 +
 +    /* Dividend > 64 bits, with MSB 1 */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
 +      0x0000000000000010ULL, 0x000000000000000fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
 +      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
 +
 +    /**
 +     * Divisor == 64 bits, with MSB 1
 +     * and high 64 bits of dividend >= divisor
 +     * (for testing normalization)
 +     */
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0x0000000000000000ULL,
 +      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
 +    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
 +      0x0000000000000001ULL, 0xfddbb9977553310aULL,
 +      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
 +
 +    /* Dividend > 64 bits, divisor almost as big */
 +    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
 +      0x0000000000000000ULL, 0x000000000000000fULL,
 +      0x123456789abcdefeULL, 0x123456789abcde1fULL},
 +};
 +
 +static const test_data_signed test_table_signed[] = {
 +    /* Positive dividend, positive/negative divisors */
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0x0000000000000008LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0xfffffffffffffff8LL, 0x0000000000000006LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0x0000000000000237LL, 0x0000000000000183LL},
 +    { 0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0xfffffffffffffdc9LL, 0x0000000000000183LL},
 +
 +    /* Negative dividend, positive/negative divisors */
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000001LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000bc614eULL,
 +      0xffffffffffffffffLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
 +      0x0000000000000002LL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x00000000005e30a7ULL,
 +      0xfffffffffffffffeLL, 0x0000000000000000LL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
 +      0x0000000000000008LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x0000000000178c29ULL,
 +      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
 +      0x0000000000000237LL, 0xfffffffffffffe7dLL},
 +    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
 +      0x0000000000000000LL, 0x000000000000550dULL,
 +      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
 +};
 +
 +static void test_divu128(void)
 +{
 +    int i;
 +    uint64_t rem;
 +    test_data_unsigned tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_unsigned); ++i) {
 +        tmp = test_table_unsigned[i];
 +
 +        rem = divu128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +static void test_divs128(void)
 +{
 +    int i;
 +    int64_t rem;
 +    test_data_signed tmp;
 +
 +    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
 +        tmp = test_table_signed[i];
 +
 +        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
 +        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
 +        g_assert_cmpuint(tmp.high, ==, tmp.rhigh);
 +        g_assert_cmpuint(rem, ==, tmp.remainder);
 +    }
 +}
 +
 +int main(int argc, char **argv)
 +{
 +    g_test_init(&argc, &argv, NULL);
 +    g_test_add_func("/host-utils/test_divu128", test_divu128);
 +    g_test_add_func("/host-utils/test_divs128", test_divs128);
 +    return g_test_run();
 +}
 diff --git a/tests/unit/meson.build b/tests/unit/meson.build
 index XXXXXXX..XXXXXXX 100644
 --- a/tests/unit/meson.build
 +++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
    # all code tested by test-x86-cpuid is inside topology.h
    'test-x86-cpuid': [],
    'test-cutils': [],
 +  'test-div128': [],
    'test-shift128': [],
    'test-mul64': [],
    # all code tested by test-int128 is inside int128.h
 --
 .25.1

-[PULL 15/16] cputlb: Pass retaddr to tb_invalidate_phys_page_fast
+[PULL 06/56] tcg/optimize: Rename "mask" to "z_mask"
-Rather than rely on cpu->mem_io_pc, pass retaddr down directly.
+Prepare for tracking different masks by renaming this one.
 Within tb_invalidate_phys_page_range__locked, the is_cpu_write_access
 parameter is non-zero exactly when retaddr would be non-zero, so that
 is a simple replacement.
 Recognize that current_tb_not_found is true only when mem_io_pc
 (and now retaddr) are also non-zero, so remove a redundant test.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/translate-all.h |  3 ++-
+ tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
- accel/tcg/cputlb.c        |  6 +-----
+file changed, 72 insertions(+), 70 deletions(-)
  accel/tcg/translate-all.c | 39 +++++++++++++++++++--------------------
 files changed, 22 insertions(+), 26 deletions(-)
-diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/translate-all.h
+--- a/tcg/optimize.c
-+++ b/accel/tcg/translate-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct page_collection *page_collection_lock(tb_page_addr_t start,
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
-                                              tb_page_addr_t end);
+     TCGTemp *prev_copy;
- void page_collection_unlock(struct page_collection *set);
+     TCGTemp *next_copy;
- void tb_invalidate_phys_page_fast(struct page_collection *pages,
+     uint64_t val;
--                                  tb_page_addr_t start, int len);
+-    uint64_t mask;
-+                                  tb_page_addr_t start, int len,
++    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
-+                                  uintptr_t retaddr);
+ } TempOptInfo;
- void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
- void tb_check_watchpoint(CPUState *cpu);
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
+     ti->next_copy = ts;
-index XXXXXXX..XXXXXXX 100644
+     ti->prev_copy = ts;
---- a/accel/tcg/cputlb.c
+     ti->is_const = false;
-+++ b/accel/tcg/cputlb.c
+-    ti->mask = -1;
-@@ -XXX,XX +XXX,XX @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
++    ti->z_mask = -1;
-     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
+ }
-         struct page_collection *pages
-             = page_collection_lock(ram_addr, ram_addr + size);
+ static void reset_temp(TCGArg arg)
--
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
--        /* We require mem_io_pc in tb_invalidate_phys_page_range.  */
+     if (ts->kind == TEMP_CONST) {
--        cpu->mem_io_pc = retaddr;
+         ti->is_const = true;
--
+         ti->val = ts->val;
--        tb_invalidate_phys_page_fast(pages, ram_addr, size);
+-        ti->mask = ts->val;
-+        tb_invalidate_phys_page_fast(pages, ram_addr, size, retaddr);
++        ti->z_mask = ts->val;
-         page_collection_unlock(pages);
+         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-     }
+             /* High bits of a 32-bit quantity are garbage.  */
+-            ti->mask |= ~0xffffffffull;
-diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
++            ti->z_mask |= ~0xffffffffull;
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static void
  tb_invalidate_phys_page_range__locked(struct page_collection *pages,
                                        PageDesc *p, tb_page_addr_t start,
                                        tb_page_addr_t end,
 -                                      int is_cpu_write_access)
 +                                      uintptr_t retaddr)
  {
      TranslationBlock *tb;
      tb_page_addr_t tb_start, tb_end;
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
  #ifdef TARGET_HAS_PRECISE_SMC
      CPUState *cpu = current_cpu;
      CPUArchState *env = NULL;
 -    int current_tb_not_found = is_cpu_write_access;
 +    bool current_tb_not_found = retaddr != 0;
 +    bool current_tb_modified = false;
      TranslationBlock *current_tb = NULL;
 -    int current_tb_modified = 0;
      target_ulong current_pc = 0;
      target_ulong current_cs_base = 0;
      uint32_t current_flags = 0;
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
          if (!(tb_end <= start || tb_start >= end)) {
  #ifdef TARGET_HAS_PRECISE_SMC
              if (current_tb_not_found) {
 -                current_tb_not_found = 0;
 -                current_tb = NULL;
 -                if (cpu->mem_io_pc) {
 -                    /* now we have a real cpu fault */
 -                    current_tb = tcg_tb_lookup(cpu->mem_io_pc);
 -                }
 +                current_tb_not_found = false;
 +                /* now we have a real cpu fault */
 +                current_tb = tcg_tb_lookup(retaddr);
              }
              if (current_tb == tb &&
                  (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
 -                /* If we are modifying the current TB, we must stop
 -                its execution. We could be more precise by checking
 -                that the modification is after the current PC, but it
 -                would require a specialized function to partially
 -                restore the CPU state */
 -
 -                current_tb_modified = 1;
 -                cpu_restore_state_from_tb(cpu, current_tb,
 -                                          cpu->mem_io_pc, true);
 +                /*
 +                 * If we are modifying the current TB, we must stop
 +                 * its execution. We could be more precise by checking
 +                 * that the modification is after the current PC, but it
 +                 * would require a specialized function to partially
 +                 * restore the CPU state.
 +                 */
 +                current_tb_modified = true;
 +                cpu_restore_state_from_tb(cpu, current_tb, retaddr, true);
                  cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
                                       &current_flags);
              }
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_range(target_ulong start, target_ulong end)
   * Call with all @pages in the range [@start, @start + len[ locked.
   */
  void tb_invalidate_phys_page_fast(struct page_collection *pages,
 -                                  tb_page_addr_t start, int len)
 +                                  tb_page_addr_t start, int len,
 +                                  uintptr_t retaddr)
  {
      PageDesc *p;
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
          }
      } else {
-     do_invalidate:
+         ti->is_const = false;
--        tb_invalidate_phys_page_range__locked(pages, p, start, start + len, 1);
+-        ti->mask = -1;
-+        tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
++        ti->z_mask = -1;
 +                                              retaddr);
      }
  }
- #else
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      const TCGOpDef *def;
      TempOptInfo *di;
      TempOptInfo *si;
 -    uint64_t mask;
 +    uint64_t z_mask;
      TCGOpcode new_op;
      if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      op->args[0] = dst;
      op->args[1] = src;
 -    mask = si->mask;
 +    z_mask = si->z_mask;
      if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
          /* High bits of the destination are now garbage.  */
 -        mask |= ~0xffffffffull;
 +        z_mask |= ~0xffffffffull;
      }
 -    di->mask = mask;
 +    di->z_mask = z_mask;
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      }
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
 -        uint64_t mask, partmask, affected, tmp;
 +        uint64_t z_mask, partmask, affected, tmp;
          int nb_oargs, nb_iargs;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def = &tcg_op_defs[opc];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          /* Simplify using known-zero bits. Currently only ops with a single
             output argument are supported. */
 -        mask = -1;
 +        z_mask = -1;
          affected = -1;
          switch (opc) {
          CASE_OP_32_64(ext8s):
 -            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
 +            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
                  break;
              }
              QEMU_FALLTHROUGH;
          CASE_OP_32_64(ext8u):
 -            mask = 0xff;
 +            z_mask = 0xff;
              goto and_const;
          CASE_OP_32_64(ext16s):
 -            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
 +            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
                  break;
              }
              QEMU_FALLTHROUGH;
          CASE_OP_32_64(ext16u):
 -            mask = 0xffff;
 +            z_mask = 0xffff;
              goto and_const;
          case INDEX_op_ext32s_i64:
 -            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
 +            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                  break;
              }
              QEMU_FALLTHROUGH;
          case INDEX_op_ext32u_i64:
 -            mask = 0xffffffffU;
 +            z_mask = 0xffffffffU;
              goto and_const;
          CASE_OP_32_64(and):
 -            mask = arg_info(op->args[2])->mask;
 +            z_mask = arg_info(op->args[2])->z_mask;
              if (arg_is_const(op->args[2])) {
          and_const:
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
 -            mask = arg_info(op->args[1])->mask & mask;
 +            z_mask = arg_info(op->args[1])->z_mask & z_mask;
              break;
          case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
 +            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                  break;
              }
              QEMU_FALLTHROUGH;
          case INDEX_op_extu_i32_i64:
              /* We do not compute affected as it is a size changing op.  */
 -            mask = (uint32_t)arg_info(op->args[1])->mask;
 +            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
              break;
          CASE_OP_32_64(andc):
              /* Known-zeros does not imply known-ones.  Therefore unless
                 op->args[2] is constant, we can't infer anything from it.  */
              if (arg_is_const(op->args[2])) {
 -                mask = ~arg_info(op->args[2])->mask;
 +                z_mask = ~arg_info(op->args[2])->z_mask;
                  goto and_const;
              }
              /* But we certainly know nothing outside args[1] may be set. */
 -            mask = arg_info(op->args[1])->mask;
 +            z_mask = arg_info(op->args[1])->z_mask;
              break;
          case INDEX_op_sar_i32:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 31;
 -                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_sar_i64:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 63;
 -                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_shr_i32:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 31;
 -                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_shr_i64:
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & 63;
 -                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
 +                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
              }
              break;
          case INDEX_op_extrl_i64_i32:
 -            mask = (uint32_t)arg_info(op->args[1])->mask;
 +            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
              break;
          case INDEX_op_extrh_i64_i32:
 -            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
 +            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
              break;
          CASE_OP_32_64(shl):
              if (arg_is_const(op->args[2])) {
                  tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
 -                mask = arg_info(op->args[1])->mask << tmp;
 +                z_mask = arg_info(op->args[1])->z_mask << tmp;
              }
              break;
          CASE_OP_32_64(neg):
              /* Set to 1 all bits to the left of the rightmost.  */
 -            mask = -(arg_info(op->args[1])->mask
 -                     & -arg_info(op->args[1])->mask);
 +            z_mask = -(arg_info(op->args[1])->z_mask
 +                       & -arg_info(op->args[1])->z_mask);
              break;
          CASE_OP_32_64(deposit):
 -            mask = deposit64(arg_info(op->args[1])->mask,
 -                             op->args[3], op->args[4],
 -                             arg_info(op->args[2])->mask);
 +            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                               op->args[3], op->args[4],
 +                               arg_info(op->args[2])->z_mask);
              break;
          CASE_OP_32_64(extract):
 -            mask = extract64(arg_info(op->args[1])->mask,
 -                             op->args[2], op->args[3]);
 +            z_mask = extract64(arg_info(op->args[1])->z_mask,
 +                               op->args[2], op->args[3]);
              if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
              break;
          CASE_OP_32_64(sextract):
 -            mask = sextract64(arg_info(op->args[1])->mask,
 -                              op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
 -                affected = arg_info(op->args[1])->mask & ~mask;
 +            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 +                                op->args[2], op->args[3]);
 +            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 +                affected = arg_info(op->args[1])->z_mask & ~z_mask;
              }
              break;
          CASE_OP_32_64(or):
          CASE_OP_32_64(xor):
 -            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
 +            z_mask = arg_info(op->args[1])->z_mask
 +                   | arg_info(op->args[2])->z_mask;
              break;
          case INDEX_op_clz_i32:
          case INDEX_op_ctz_i32:
 -            mask = arg_info(op->args[2])->mask | 31;
 +            z_mask = arg_info(op->args[2])->z_mask | 31;
              break;
          case INDEX_op_clz_i64:
          case INDEX_op_ctz_i64:
 -            mask = arg_info(op->args[2])->mask | 63;
 +            z_mask = arg_info(op->args[2])->z_mask | 63;
              break;
          case INDEX_op_ctpop_i32:
 -            mask = 32 | 31;
 +            z_mask = 32 | 31;
              break;
          case INDEX_op_ctpop_i64:
 -            mask = 64 | 63;
 +            z_mask = 64 | 63;
              break;
          CASE_OP_32_64(setcond):
          case INDEX_op_setcond2_i32:
 -            mask = 1;
 +            z_mask = 1;
              break;
          CASE_OP_32_64(movcond):
 -            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
 +            z_mask = arg_info(op->args[3])->z_mask
 +                   | arg_info(op->args[4])->z_mask;
              break;
          CASE_OP_32_64(ld8u):
 -            mask = 0xff;
 +            z_mask = 0xff;
              break;
          CASE_OP_32_64(ld16u):
 -            mask = 0xffff;
 +            z_mask = 0xffff;
              break;
          case INDEX_op_ld32u_i64:
 -            mask = 0xffffffffu;
 +            z_mask = 0xffffffffu;
              break;
          CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                  MemOp mop = get_memop(oi);
                  if (!(mop & MO_SIGN)) {
 -                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 +                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                  }
              }
              break;
          CASE_OP_32_64(bswap16):
 -            mask = arg_info(op->args[1])->mask;
 -            if (mask <= 0xffff) {
 +            z_mask = arg_info(op->args[1])->z_mask;
 +            if (z_mask <= 0xffff) {
                  op->args[2] |= TCG_BSWAP_IZ;
              }
 -            mask = bswap16(mask);
 +            z_mask = bswap16(z_mask);
              switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
              case TCG_BSWAP_OZ:
                  break;
              case TCG_BSWAP_OS:
 -                mask = (int16_t)mask;
 +                z_mask = (int16_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(16, 48);
 +                z_mask |= MAKE_64BIT_MASK(16, 48);
                  break;
              }
              break;
          case INDEX_op_bswap32_i64:
 -            mask = arg_info(op->args[1])->mask;
 -            if (mask <= 0xffffffffu) {
 +            z_mask = arg_info(op->args[1])->z_mask;
 +            if (z_mask <= 0xffffffffu) {
                  op->args[2] |= TCG_BSWAP_IZ;
              }
 -            mask = bswap32(mask);
 +            z_mask = bswap32(z_mask);
              switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
              case TCG_BSWAP_OZ:
                  break;
              case TCG_BSWAP_OS:
 -                mask = (int32_t)mask;
 +                z_mask = (int32_t)z_mask;
                  break;
              default: /* undefined high bits */
 -                mask |= MAKE_64BIT_MASK(32, 32);
 +                z_mask |= MAKE_64BIT_MASK(32, 32);
                  break;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          /* 32-bit ops generate 32-bit results.  For the result is zero test
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
 -        partmask = mask;
 +        partmask = z_mask;
          if (!(def->flags & TCG_OPF_64BIT)) {
 -            mask |= ~(tcg_target_ulong)0xffffffffu;
 +            z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     vs the high word of the input.  */
              do_setcond_high:
                  reset_temp(op->args[0]);
 -                arg_info(op->args[0])->mask = 1;
 +                arg_info(op->args[0])->z_mask = 1;
                  op->opc = INDEX_op_setcond_i32;
                  op->args[1] = op->args[2];
                  op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              do_setcond_low:
                  reset_temp(op->args[0]);
 -                arg_info(op->args[0])->mask = 1;
 +                arg_info(op->args[0])->z_mask = 1;
                  op->opc = INDEX_op_setcond_i32;
                  op->args[2] = op->args[3];
                  op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              /* Default case: we know nothing about operation (or were unable
                 to compute the operation result) so no propagation is done.
                 We trash everything if the operation is the end of a basic
 -               block, otherwise we only trash the output args.  "mask" is
 +               block, otherwise we only trash the output args.  "z_mask" is
                 the non-zero bits mask for the first output arg.  */
              if (def->flags & TCG_OPF_BB_END) {
                  memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Save the corresponding known-zero bits mask for the
                         first output argument (only one supported so far). */
                      if (i == 0) {
 -                        arg_info(op->args[i])->mask = mask;
 +                        arg_info(op->args[i])->z_mask = z_mask;
                      }
                  }
              }
 --
-.17.1
+.25.1

-New patch
+[PULL 07/56] tcg/optimize: Split out OptContext
+Provide what will become a larger context for splitting
+the very large tcg_optimize function.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
+ 1 file changed, 40 insertions(+), 37 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+ } TempOptInfo;
++typedef struct OptContext {
++    TCGTempSet temps_used;
++} OptContext;
++
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
+ {
+     return ts->state_ptr;
+@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
+ }
+ /* Initialize and activate a temporary.  */
+-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
++static void init_ts_info(OptContext *ctx, TCGTemp *ts)
+ {
+     size_t idx = temp_idx(ts);
+     TempOptInfo *ti;
+-    if (test_bit(idx, temps_used->l)) {
++    if (test_bit(idx, ctx->temps_used.l)) {
+         return;
+     }
+-    set_bit(idx, temps_used->l);
++    set_bit(idx, ctx->temps_used.l);
+     ti = ts->state_ptr;
+     if (ti == NULL) {
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+     }
+ }
+-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
++static void init_arg_info(OptContext *ctx, TCGArg arg)
+ {
+-    init_ts_info(temps_used, arg_temp(arg));
++    init_ts_info(ctx, arg_temp(arg));
+ }
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+     }
+ }
+-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
++static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
+                              TCGOp *op, TCGArg dst, uint64_t val)
+ {
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+     /* Convert movi to mov with constant temp. */
+     tv = tcg_constant_internal(type, val);
+-    init_ts_info(temps_used, tv);
++    init_ts_info(ctx, tv);
+     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+ }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+     TCGOp *op, *op_next, *prev_mb = NULL;
+-    TCGTempSet temps_used;
++    OptContext ctx = {};
+     /* Array VALS has an element for each temp.
+        If this temp holds a constant then its value is kept in VALS' element.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+     nb_temps = s->nb_temps;
+     nb_globals = s->nb_globals;
+-    memset(&temps_used, 0, sizeof(temps_used));
+     for (i = 0; i < nb_temps; ++i) {
+         s->temps[i].state_ptr = NULL;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+                 TCGTemp *ts = arg_temp(op->args[i]);
+                 if (ts) {
+-                    init_ts_info(&temps_used, ts);
++                    init_ts_info(&ctx, ts);
+                 }
+             }
+         } else {
+             nb_oargs = def->nb_oargs;
+             nb_iargs = def->nb_iargs;
+             for (i = 0; i < nb_oargs + nb_iargs; i++) {
+-                init_arg_info(&temps_used, op->args[i]);
++                init_arg_info(&ctx, op->args[i]);
+             }
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(rotr):
+             if (arg_is_const(op->args[1])
+                 && arg_info(op->args[1])->val == 0) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (partmask == 0) {
+             tcg_debug_assert(nb_oargs == 1);
+-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+             continue;
+         }
+         if (affected == 0) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(mulsh):
+             if (arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+         CASE_OP_32_64_VEC(xor):
+             if (args_are_copies(op->args[1], op->args[2])) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                 continue;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = arg_info(op->args[1])->val;
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_dup2_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
+                                            arg_info(op->args[2])->val));
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_extrh_i64_i32:
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           op->args[2]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 TCGArg v = arg_info(op->args[1])->val;
+                 if (v != 0) {
+                     tmp = do_constant_folding(opc, v, 0);
+-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 } else {
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                 }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = deposit64(arg_info(op->args[1])->val,
+                                 op->args[3], op->args[4],
+                                 arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = extract64(arg_info(op->args[1])->val,
+                                 op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = sextract64(arg_info(op->args[1])->val,
+                                  op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
+                                     ((uint32_t)v2 << (32 - shr)));
+                 }
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[3]);
+             if (tmp != 2) {
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                            op->args[1], op->args[2]);
+             if (tmp != 2) {
+                 if (tmp) {
+-                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                     op->opc = INDEX_op_br;
+                     op->args[0] = op->args[3];
+                 } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rl = op->args[0];
+                 rh = op->args[1];
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rl = op->args[0];
+                 rh = op->args[1];
+-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
++                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
++                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                 break;
+             }
+             goto do_default;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (tmp != 2) {
+                 if (tmp) {
+             do_brcond_true:
+-                    memset(&temps_used, 0, sizeof(temps_used));
++                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                     op->opc = INDEX_op_br;
+                     op->args[0] = op->args[5];
+                 } else {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_brcond_high:
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = INDEX_op_brcond_i32;
+                 op->args[0] = op->args[1];
+                 op->args[1] = op->args[3];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     goto do_default;
+                 }
+             do_brcond_low:
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                 op->opc = INDEX_op_brcond_i32;
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                             op->args[5]);
+             if (tmp != 2) {
+             do_setcond_const:
+-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
++                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+             } else if ((op->args[5] == TCG_COND_LT
+                         || op->args[5] == TCG_COND_GE)
+                        && arg_is_const(op->args[3])
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (!(tcg_call_flags(op)
+                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+                 for (i = 0; i < nb_globals; i++) {
+-                    if (test_bit(i, temps_used.l)) {
++                    if (test_bit(i, ctx.temps_used.l)) {
+                         reset_ts(&s->temps[i]);
+                     }
+                 }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                block, otherwise we only trash the output args.  "z_mask" is
+                the non-zero bits mask for the first output arg.  */
+             if (def->flags & TCG_OPF_BB_END) {
+-                memset(&temps_used, 0, sizeof(temps_used));
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+             } else {
+         do_reset_output:
+                 for (i = 0; i < nb_oargs; i++) {
+--
+.25.1

-New patch
+[PULL 08/56] tcg/optimize: Remove do_default label
+Break the final cleanup clause out of the main switch
+statement.  When fully folding an opcode to mov/movi,
+use "continue" to process the next opcode, else break
+to fall into the final cleanup.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
+ 1 file changed, 94 insertions(+), 96 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         switch (opc) {
+         CASE_OP_32_64_VEC(mov):
+             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+-            break;
++            continue;
+         case INDEX_op_dup_vec:
+             if (arg_is_const(op->args[1])) {
+                 tmp = arg_info(op->args[1])->val;
+                 tmp = dup_const(TCGOP_VECE(op), tmp);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_dup2_vec:
+             assert(TCG_TARGET_REG_BITS == 32);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                                  deposit64(arg_info(op->args[1])->val, 32, 32,
+                                            arg_info(op->args[2])->val));
+-                break;
++                continue;
+             } else if (args_are_copies(op->args[1], op->args[2])) {
+                 op->opc = INDEX_op_dup_vec;
+                 TCGOP_VECE(op) = MO_32;
+                 nb_iargs = 1;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(not):
+         CASE_OP_32_64(neg):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (arg_is_const(op->args[1])) {
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(bswap16):
+         CASE_OP_32_64(bswap32):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           op->args[2]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(add):
+         CASE_OP_32_64(sub):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+                                           arg_info(op->args[2])->val);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else {
+                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                 }
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(deposit):
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                 op->args[3], op->args[4],
+                                 arg_info(op->args[2])->val);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(extract):
+             if (arg_is_const(op->args[1])) {
+                 tmp = extract64(arg_info(op->args[1])->val,
+                                 op->args[2], op->args[3]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(sextract):
+             if (arg_is_const(op->args[1])) {
+                 tmp = sextract64(arg_info(op->args[1])->val,
+                                  op->args[2], op->args[3]);
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(extract2):
+             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                     ((uint32_t)v2 << (32 - shr)));
+                 }
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(setcond):
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[3]);
+             if (tmp != 2) {
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(brcond):
+             tmp = do_constant_folding_cond(opc, op->args[0],
+                                            op->args[1], op->args[2]);
+-            if (tmp != 2) {
+-                if (tmp) {
+-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                    op->opc = INDEX_op_br;
+-                    op->args[0] = op->args[3];
+-                } else {
+-                    tcg_op_remove(s, op);
+-                }
++            switch (tmp) {
++            case 0:
++                tcg_op_remove(s, op);
++                continue;
++            case 1:
++                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
++                op->opc = opc = INDEX_op_br;
++                op->args[0] = op->args[3];
+                 break;
+             }
+-            goto do_default;
++            break;
+         CASE_OP_32_64(movcond):
+             tmp = do_constant_folding_cond(opc, op->args[1],
+                                            op->args[2], op->args[5]);
+             if (tmp != 2) {
+                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+-                break;
++                continue;
+             }
+             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+                 uint64_t tv = arg_info(op->args[3])->val;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (fv == 1 && tv == 0) {
+                     cond = tcg_invert_cond(cond);
+                 } else if (!(tv == 1 && fv == 0)) {
+-                    goto do_default;
++                    break;
+                 }
+                 op->args[3] = cond;
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                                  : INDEX_op_setcond_i64);
+                 nb_iargs = 2;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_add2_i32:
+         case INDEX_op_sub2_i32:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rh = op->args[1];
+                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_mulu2_i32:
+             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 rh = op->args[1];
+                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+-                break;
++                continue;
+             }
+-            goto do_default;
++            break;
+         case INDEX_op_brcond2_i32:
+             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                             op->args[4]);
+-            if (tmp != 2) {
+-                if (tmp) {
+-            do_brcond_true:
+-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                    op->opc = INDEX_op_br;
+-                    op->args[0] = op->args[5];
+-                } else {
++            if (tmp == 0) {
+             do_brcond_false:
+-                    tcg_op_remove(s, op);
+-                }
+-            } else if ((op->args[4] == TCG_COND_LT
+-                        || op->args[4] == TCG_COND_GE)
+-                       && arg_is_const(op->args[2])
+-                       && arg_info(op->args[2])->val == 0
+-                       && arg_is_const(op->args[3])
+-                       && arg_info(op->args[3])->val == 0) {
++                tcg_op_remove(s, op);
++                continue;
++            }
++            if (tmp == 1) {
++            do_brcond_true:
++                op->opc = opc = INDEX_op_br;
++                op->args[0] = op->args[5];
++                break;
++            }
++            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
++                 && arg_is_const(op->args[2])
++                 && arg_info(op->args[2])->val == 0
++                 && arg_is_const(op->args[3])
++                 && arg_info(op->args[3])->val == 0) {
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_brcond_high:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
++                op->opc = opc = INDEX_op_brcond_i32;
+                 op->args[0] = op->args[1];
+                 op->args[1] = op->args[3];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[4] == TCG_COND_EQ) {
++                break;
++            }
++            if (op->args[4] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (tmp == 0) {
+                     goto do_brcond_false;
+                 } else if (tmp != 1) {
+-                    goto do_default;
++                    break;
+                 }
+             do_brcond_low:
+                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[4] == TCG_COND_NE) {
++                break;
++            }
++            if (op->args[4] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else if (tmp == 1) {
+                     goto do_brcond_true;
+                 }
+-                goto do_default;
+-            } else {
+-                goto do_default;
+             }
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             if (tmp != 2) {
+             do_setcond_const:
+                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+-            } else if ((op->args[5] == TCG_COND_LT
+-                        || op->args[5] == TCG_COND_GE)
+-                       && arg_is_const(op->args[3])
+-                       && arg_info(op->args[3])->val == 0
+-                       && arg_is_const(op->args[4])
+-                       && arg_info(op->args[4])->val == 0) {
++                continue;
++            }
++            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
++                 && arg_is_const(op->args[3])
++                 && arg_info(op->args[3])->val == 0
++                 && arg_is_const(op->args[4])
++                 && arg_info(op->args[4])->val == 0) {
+                 /* Simplify LT/GE comparisons vs zero to a single compare
+                    vs the high word of the input.  */
+             do_setcond_high:
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->args[1] = op->args[2];
+                 op->args[2] = op->args[4];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[5] == TCG_COND_EQ) {
++                break;
++            }
++            if (op->args[5] == TCG_COND_EQ) {
+                 /* Simplify EQ comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 if (tmp == 0) {
+                     goto do_setcond_high;
+                 } else if (tmp != 1) {
+-                    goto do_default;
++                    break;
+                 }
+             do_setcond_low:
+                 reset_temp(op->args[0]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 op->opc = INDEX_op_setcond_i32;
+                 op->args[2] = op->args[3];
+                 op->args[3] = op->args[5];
+-            } else if (op->args[5] == TCG_COND_NE) {
++                break;
++            }
++            if (op->args[5] == TCG_COND_NE) {
+                 /* Simplify NE comparisons where one of the pairs
+                    can be simplified.  */
+                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 } else if (tmp == 1) {
+                     goto do_setcond_const;
+                 }
+-                goto do_default;
+-            } else {
+-                goto do_default;
+             }
+             break;
+-        case INDEX_op_call:
+-            if (!(tcg_call_flags(op)
++        default:
++            break;
++        }
++
++        /* Some of the folding above can change opc. */
++        opc = op->opc;
++        def = &tcg_op_defs[opc];
++        if (def->flags & TCG_OPF_BB_END) {
++            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
++        } else {
++            if (opc == INDEX_op_call &&
++                !(tcg_call_flags(op)
+                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+                 for (i = 0; i < nb_globals; i++) {
+                     if (test_bit(i, ctx.temps_used.l)) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                     }
+                 }
+             }
+-            goto do_reset_output;
+-        default:
+-        do_default:
+-            /* Default case: we know nothing about operation (or were unable
+-               to compute the operation result) so no propagation is done.
+-               We trash everything if the operation is the end of a basic
+-               block, otherwise we only trash the output args.  "z_mask" is
+-               the non-zero bits mask for the first output arg.  */
+-            if (def->flags & TCG_OPF_BB_END) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-            } else {
+-        do_reset_output:
+-                for (i = 0; i < nb_oargs; i++) {
+-                    reset_temp(op->args[i]);
+-                    /* Save the corresponding known-zero bits mask for the
+-                       first output argument (only one supported so far). */
+-                    if (i == 0) {
+-                        arg_info(op->args[i])->z_mask = z_mask;
+-                    }
++            for (i = 0; i < nb_oargs; i++) {
++                reset_temp(op->args[i]);
++                /* Save the corresponding known-zero bits mask for the
++                   first output argument (only one supported so far). */
++                if (i == 0) {
++                    arg_info(op->args[i])->z_mask = z_mask;
+                 }
+             }
+-            break;
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+--
+2.25.1

-New patch
+[PULL 09/56] tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
+Adjust the interface to take the OptContext parameter instead
 of TCGContext or both.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
  1 file changed, 34 insertions(+), 33 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
  } TempOptInfo;
  typedef struct OptContext {
 +    TCGContext *tcg;
      TCGTempSet temps_used;
  } OptContext;
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
      return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
  }
 -static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
 +static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      TCGOpcode new_op;
      if (ts_are_copies(dst_ts, src_ts)) {
 -        tcg_op_remove(s, op);
 +        tcg_op_remove(ctx->tcg, op);
          return;
      }
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
      }
  }
 -static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
 -                             TCGOp *op, TCGArg dst, uint64_t val)
 +static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
 +                             TCGArg dst, uint64_t val)
  {
      const TCGOpDef *def = &tcg_op_defs[op->opc];
      TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
      /* Convert movi to mov with constant temp. */
      tv = tcg_constant_internal(type, val);
      init_ts_info(ctx, tv);
 -    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 +    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
  static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
  {
      int nb_temps, nb_globals, i;
      TCGOp *op, *op_next, *prev_mb = NULL;
 -    OptContext ctx = {};
 +    OptContext ctx = { .tcg = s };
      /* Array VALS has an element for each temp.
         If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(rotr):
              if (arg_is_const(op->args[1])
                  && arg_info(op->args[1])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (!arg_is_const(op->args[1])
                  && arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == -1) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          if (partmask == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
              continue;
          }
          if (affected == 0) {
              tcg_debug_assert(nb_oargs == 1);
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64(mulsh):
              if (arg_is_const(op->args[2])
                  && arg_info(op->args[2])->val == 0) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(or):
          CASE_OP_32_64_VEC(and):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          CASE_OP_32_64_VEC(sub):
          CASE_OP_32_64_VEC(xor):
              if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             allocator where needed and possible.  Also detect copies. */
          switch (opc) {
          CASE_OP_32_64_VEC(mov):
 -            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
 +            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
              continue;
          case INDEX_op_dup_vec:
              if (arg_is_const(op->args[1])) {
                  tmp = arg_info(op->args[1])->val;
                  tmp = dup_const(TCGOP_VECE(op), tmp);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_dup2_vec:
              assert(TCG_TARGET_REG_BITS == 32);
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
 +                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                   deposit64(arg_info(op->args[1])->val, 32, 32,
                                             arg_info(op->args[2])->val));
                  continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            op->args[2]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                  tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                            arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGArg v = arg_info(op->args[1])->val;
                  if (v != 0) {
                      tmp = do_constant_folding(opc, v, 0);
 -                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  } else {
 -                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
 +                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                  }
                  continue;
              }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  tmp = deposit64(arg_info(op->args[1])->val,
                                  op->args[3], op->args[4],
                                  arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = extract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (arg_is_const(op->args[1])) {
                  tmp = sextract64(arg_info(op->args[1])->val,
                                   op->args[2], op->args[3]);
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                      ((uint32_t)v2 << (32 - shr)));
                  }
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[3]);
              if (tmp != 2) {
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              tmp = do_constant_folding_cond(opc, op->args[1],
                                             op->args[2], op->args[5]);
              if (tmp != 2) {
 -                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  rl = op->args[0];
                  rh = op->args[1];
 -                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
 -                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
 +                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
 +                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                  continue;
              }
              break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                              op->args[5]);
              if (tmp != 2) {
              do_setcond_const:
 -                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
 --
 2.25.1

-New patch
+[PULL 10/56] tcg/optimize: Move prev_mb into OptContext
+This will expose the variable to subroutines that
+will be broken out of tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
+ typedef struct OptContext {
+     TCGContext *tcg;
++    TCGOp *prev_mb;
+     TCGTempSet temps_used;
+ } OptContext;
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
+ void tcg_optimize(TCGContext *s)
+ {
+     int nb_temps, nb_globals, i;
+-    TCGOp *op, *op_next, *prev_mb = NULL;
++    TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+     /* Array VALS has an element for each temp.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         }
+         /* Eliminate duplicate and redundant fence instructions.  */
+-        if (prev_mb) {
++        if (ctx.prev_mb) {
+             switch (opc) {
+             case INDEX_op_mb:
+                 /* Merge two barriers of the same type into one,
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                  * barrier.  This is stricter than specified but for
+                  * the purposes of TCG is better than not optimizing.
+                  */
+-                prev_mb->args[0] |= op->args[0];
++                ctx.prev_mb->args[0] |= op->args[0];
+                 tcg_op_remove(s, op);
+                 break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i64:
+             case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+-                prev_mb = NULL;
++                ctx.prev_mb = NULL;
+                 break;
+             }
+         } else if (opc == INDEX_op_mb) {
+-            prev_mb = op;
++            ctx.prev_mb = op;
+         }
+     }
+ }
+--
+2.25.1

-[PULL 09/16] cputlb: Move NOTDIRTY handling from I/O path to TLB path
+[PULL 11/56] tcg/optimize: Split out init_arguments
-Pages that we want to track for NOTDIRTY are RAM.  We do not
+There was no real reason for calls to have separate code here.
-really need to go through the I/O path to handle them.
+Unify init for calls vs non-calls using the call path, which
 handles TCG_CALL_DUMMY_ARG.
-Acked-by: David Hildenbrand <david@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-common.h |  2 --
+ tcg/optimize.c | 25 +++++++++++--------------
- accel/tcg/cputlb.c        | 26 +++++++++++++++++---
+ 1 file changed, 11 insertions(+), 14 deletions(-)
  exec.c                    | 50 ---------------------------------------
  memory.c                  | 16 -------------
  4 files changed, 23 insertions(+), 71 deletions(-)
-diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu-common.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu-common.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void qemu_flush_coalesced_mmio_buffer(void);
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
  void cpu_flush_icache_range(hwaddr start, hwaddr len);
 -extern struct MemoryRegion io_mem_notdirty;
 -
  typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque);
  int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      mr = section->mr;
      mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
      cpu->mem_io_pc = retaddr;
 -    if (mr != &io_mem_notdirty && !cpu->can_do_io) {
 +    if (!cpu->can_do_io) {
          cpu_io_recompile(cpu, retaddr);
      }
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
      mr = section->mr;
      mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
 -    if (mr != &io_mem_notdirty && !cpu->can_do_io) {
 +    if (!cpu->can_do_io) {
          cpu_io_recompile(cpu, retaddr);
      }
      cpu->mem_io_vaddr = addr;
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
          need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
          /* Handle I/O access.  */
 -        if (likely(tlb_addr & (TLB_MMIO | TLB_NOTDIRTY))) {
 +        if (tlb_addr & TLB_MMIO) {
              io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
                        op ^ (need_swap * MO_BSWAP));
              return;
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
          haddr = (void *)((uintptr_t)addr + entry->addend);
 +        /* Handle clean RAM pages.  */
 +        if (tlb_addr & TLB_NOTDIRTY) {
 +            NotDirtyInfo ndi;
 +
 +            /* We require mem_io_pc in tb_invalidate_phys_page_range.  */
 +            env_cpu(env)->mem_io_pc = retaddr;
 +
 +            memory_notdirty_write_prepare(&ndi, env_cpu(env), addr,
 +                                          addr + iotlbentry->addr, size);
 +
 +            if (unlikely(need_swap)) {
 +                store_memop(haddr, val, op ^ MO_BSWAP);
 +            } else {
 +                store_memop(haddr, val, op);
 +            }
 +
 +            memory_notdirty_write_complete(&ndi);
 +            return;
 +        }
 +
          /*
           * Keep these two store_memop separate to ensure that the compiler
           * is able to fold the entire function to a single instruction.
 diff --git a/exec.c b/exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/exec.c
 +++ b/exec.c
@@ -XXX,XX +XXX,XX @@ static MemoryRegion *system_io;
  AddressSpace address_space_io;
  AddressSpace address_space_memory;
 -MemoryRegion io_mem_notdirty;
  static MemoryRegion io_mem_unassigned;
  #endif
@@ -XXX,XX +XXX,XX @@ typedef struct subpage_t {
  } subpage_t;
  #define PHYS_SECTION_UNASSIGNED 0
 -#define PHYS_SECTION_NOTDIRTY 1
  static void io_mem_init(void);
  static void memory_map_init(void);
@@ -XXX,XX +XXX,XX @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,
      if (memory_region_is_ram(section->mr)) {
          /* Normal RAM.  */
          iotlb = memory_region_get_ram_addr(section->mr) + xlat;
 -        if (!section->readonly) {
 -            iotlb |= PHYS_SECTION_NOTDIRTY;
 -        }
      } else {
          AddressSpaceDispatch *d;
@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_complete(NotDirtyInfo *ndi)
      }
  }
--/* Called within RCU critical section.  */
+-static void init_arg_info(OptContext *ctx, TCGArg arg)
 -static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
 -                               uint64_t val, unsigned size)
 -{
--    NotDirtyInfo ndi;
+-    init_ts_info(ctx, arg_temp(arg));
 -
 -    memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
 -                         ram_addr, size);
 -
 -    stn_p(qemu_map_ram_ptr(NULL, ram_addr), size, val);
 -    memory_notdirty_write_complete(&ndi);
 -}
 -
--static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
+ static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 -                                 unsigned size, bool is_write,
 -                                 MemTxAttrs attrs)
 -{
 -    return is_write;
 -}
 -
 -static const MemoryRegionOps notdirty_mem_ops = {
 -    .write = notdirty_mem_write,
 -    .valid.accepts = notdirty_mem_accepts,
 -    .endianness = DEVICE_NATIVE_ENDIAN,
 -    .valid = {
 -        .min_access_size = 1,
 -        .max_access_size = 8,
 -        .unaligned = false,
 -    },
 -    .impl = {
 -        .min_access_size = 1,
 -        .max_access_size = 8,
 -        .unaligned = false,
 -    },
 -};
 -
  /* Generate a debug exception if a watchpoint has been hit.  */
  void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
                            MemTxAttrs attrs, int flags, uintptr_t ra)
@@ -XXX,XX +XXX,XX @@ static void io_mem_init(void)
  {
-     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
+     TCGTemp *i, *g, *l;
-                           NULL, UINT64_MAX);
+@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
--
+     return false;
 -    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
 -     * which can be called without the iothread mutex.
 -     */
 -    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
 -                          NULL, UINT64_MAX);
 -    memory_region_clear_global_locking(&io_mem_notdirty);
  }
- AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
++static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
-@@ -XXX,XX +XXX,XX @@ AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
++{
++    for (int i = 0; i < nb_args; i++) {
-     n = dummy_section(&d->map, fv, &io_mem_unassigned);
++        TCGTemp *ts = arg_temp(op->args[i]);
-     assert(n == PHYS_SECTION_UNASSIGNED);
++        if (ts) {
--    n = dummy_section(&d->map, fv, &io_mem_notdirty);
++            init_ts_info(ctx, ts);
--    assert(n == PHYS_SECTION_NOTDIRTY);
++        }
++    }
-     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
++}
++
-diff --git a/memory.c b/memory.c
+ /* Propagate constants and copies, fold constant expressions. */
-index XXXXXXX..XXXXXXX 100644
+ void tcg_optimize(TCGContext *s)
---- a/memory.c
+ {
-+++ b/memory.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-@@ -XXX,XX +XXX,XX @@ static MemTxResult  memory_region_read_accessor(MemoryRegion *mr,
+         if (opc == INDEX_op_call) {
-     tmp = mr->ops->read(mr->opaque, addr, size);
+             nb_oargs = TCGOP_CALLO(op);
-     if (mr->subpage) {
+             nb_iargs = TCGOP_CALLI(op);
-         trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size);
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
--    } else if (mr == &io_mem_notdirty) {
+-                TCGTemp *ts = arg_temp(op->args[i]);
--        /* Accesses to code which has previously been translated into a TB show
+-                if (ts) {
--         * up in the MMIO path, as accesses to the io_mem_notdirty
+-                    init_ts_info(&ctx, ts);
--         * MemoryRegion. */
+-                }
-     } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
+-            }
-         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
+         } else {
-         trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
+             nb_oargs = def->nb_oargs;
-@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr,
+             nb_iargs = def->nb_iargs;
-     r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs);
+-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-     if (mr->subpage) {
+-                init_arg_info(&ctx, op->args[i]);
-         trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size);
+-            }
--    } else if (mr == &io_mem_notdirty) {
+         }
--        /* Accesses to code which has previously been translated into a TB show
++        init_arguments(&ctx, op, nb_oargs + nb_iargs);
--         * up in the MMIO path, as accesses to the io_mem_notdirty
--         * MemoryRegion. */
+         /* Do copy propagation */
-     } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
+         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
          hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
          trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
      if (mr->subpage) {
          trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
 -    } else if (mr == &io_mem_notdirty) {
 -        /* Accesses to code which has previously been translated into a TB show
 -         * up in the MMIO path, as accesses to the io_mem_notdirty
 -         * MemoryRegion. */
      } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
          hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
          trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr,
      if (mr->subpage) {
          trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
 -    } else if (mr == &io_mem_notdirty) {
 -        /* Accesses to code which has previously been translated into a TB show
 -         * up in the MMIO path, as accesses to the io_mem_notdirty
 -         * MemoryRegion. */
      } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
          hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
          trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
 --
-.17.1
+.25.1

-New patch
+[PULL 12/56] tcg/optimize: Split out copy_propagate
+Continue splitting tcg_optimize.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 22 ++++++++++++++--------
+file changed, 14 insertions(+), 8 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
+     }
+ }
++static void copy_propagate(OptContext *ctx, TCGOp *op,
++                           int nb_oargs, int nb_iargs)
++{
++    TCGContext *s = ctx->tcg;
++
++    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
++        TCGTemp *ts = arg_temp(op->args[i]);
++        if (ts && ts_is_copy(ts)) {
++            op->args[i] = temp_arg(find_better_copy(s, ts));
++        }
++    }
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             nb_iargs = def->nb_iargs;
+         }
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
+-
+-        /* Do copy propagation */
+-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+-            TCGTemp *ts = arg_temp(op->args[i]);
+-            if (ts && ts_is_copy(ts)) {
+-                op->args[i] = temp_arg(find_better_copy(s, ts));
+-            }
+-        }
++        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+         /* For commutative operations make constant second argument */
+         switch (opc) {
+--
+.25.1

-New patch
+[PULL 13/56] tcg/optimize: Split out fold_call
+Calls are special in that they have a variable number
+of arguments, and need to be able to clobber globals.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
+file changed, 41 insertions(+), 22 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
+     }
+ }
++static bool fold_call(OptContext *ctx, TCGOp *op)
++{
++    TCGContext *s = ctx->tcg;
++    int nb_oargs = TCGOP_CALLO(op);
++    int nb_iargs = TCGOP_CALLI(op);
++    int flags, i;
++
++    init_arguments(ctx, op, nb_oargs + nb_iargs);
++    copy_propagate(ctx, op, nb_oargs, nb_iargs);
++
++    /* If the function reads or writes globals, reset temp data. */
++    flags = tcg_call_flags(op);
++    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
++        int nb_globals = s->nb_globals;
++
++        for (i = 0; i < nb_globals; i++) {
++            if (test_bit(i, ctx->temps_used.l)) {
++                reset_ts(&ctx->tcg->temps[i]);
++            }
++        }
++    }
++
++    /* Reset temp data for outputs. */
++    for (i = 0; i < nb_oargs; i++) {
++        reset_temp(op->args[i]);
++    }
++
++    /* Stop optimizing MB across calls. */
++    ctx->prev_mb = NULL;
++    return true;
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+-    int nb_temps, nb_globals, i;
++    int nb_temps, i;
+     TCGOp *op, *op_next;
+     OptContext ctx = { .tcg = s };
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+        available through the doubly linked circular list. */
+     nb_temps = s->nb_temps;
+-    nb_globals = s->nb_globals;
+-
+     for (i = 0; i < nb_temps; ++i) {
+         s->temps[i].state_ptr = NULL;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
+         int nb_oargs, nb_iargs;
+         TCGOpcode opc = op->opc;
+-        const TCGOpDef *def = &tcg_op_defs[opc];
++        const TCGOpDef *def;
+-        /* Count the arguments, and initialize the temps that are
+-           going to be used */
++        /* Calls are special. */
+         if (opc == INDEX_op_call) {
+-            nb_oargs = TCGOP_CALLO(op);
+-            nb_iargs = TCGOP_CALLI(op);
+-        } else {
+-            nb_oargs = def->nb_oargs;
+-            nb_iargs = def->nb_iargs;
++            fold_call(&ctx, op);
++            continue;
+         }
++
++        def = &tcg_op_defs[opc];
++        nb_oargs = def->nb_oargs;
++        nb_iargs = def->nb_iargs;
+         init_arguments(&ctx, op, nb_oargs + nb_iargs);
+         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (def->flags & TCG_OPF_BB_END) {
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+         } else {
+-            if (opc == INDEX_op_call &&
+-                !(tcg_call_flags(op)
+-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+-                for (i = 0; i < nb_globals; i++) {
+-                    if (test_bit(i, ctx.temps_used.l)) {
+-                        reset_ts(&s->temps[i]);
+-                    }
+-                }
+-            }
+-
+             for (i = 0; i < nb_oargs; i++) {
+                 reset_temp(op->args[i]);
+                 /* Save the corresponding known-zero bits mask for the
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             case INDEX_op_qemu_st_i32:
+             case INDEX_op_qemu_st8_i32:
+             case INDEX_op_qemu_st_i64:
+-            case INDEX_op_call:
+                 /* Opcodes that touch guest memory stop the optimization.  */
+                 ctx.prev_mb = NULL;
+                 break;
+--
+.25.1

-[PULL 06/16] cputlb: Introduce TLB_BSWAP
+[PULL 14/56] tcg/optimize: Drop nb_oargs, nb_iargs locals
-Handle bswap on ram directly in load/store_helper.  This fixes a
+Rather than try to keep these up-to-date across folding,
-bug with the previous implementation in that one cannot use the
+re-read nb_oargs at the end, after re-reading the opcode.
 I/O path for RAM.
-Fixes: a26fc6f5152b47f1
+A couple of asserts need dropping, but that will take care
 of itself as we split the function further.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-all.h |  4 ++-
+ tcg/optimize.c | 14 ++++----------
- accel/tcg/cputlb.c     | 72 +++++++++++++++++++++++++-----------------
+file changed, 4 insertions(+), 10 deletions(-)
 files changed, 46 insertions(+), 30 deletions(-)
-diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- #define TLB_MMIO            (1 << (TARGET_PAGE_BITS_MIN - 3))
- /* Set if TLB entry contains a watchpoint.  */
+     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
- #define TLB_WATCHPOINT      (1 << (TARGET_PAGE_BITS_MIN - 4))
+         uint64_t z_mask, partmask, affected, tmp;
-+/* Set if TLB entry requires byte swap.  */
+-        int nb_oargs, nb_iargs;
-+#define TLB_BSWAP           (1 << (TARGET_PAGE_BITS_MIN - 5))
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def;
- /* Use this mask to check interception with an alignment mask
-  * in a TCG backend.
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
   */
  #define TLB_FLAGS_MASK \
 -    (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT)
 +    (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT | TLB_BSWAP)
  /**
   * tlb_hit_page: return true if page aligned @addr is a hit against the
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
          address |= TLB_INVALID_MASK;
      }
      if (attrs.byte_swap) {
 -        /* Force the access through the I/O slow path.  */
 -        address |= TLB_MMIO;
 +        address |= TLB_BSWAP;
      }
      if (!memory_region_is_ram(section->mr) &&
          !memory_region_is_romd(section->mr)) {
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      bool locked = false;
      MemTxResult r;
 -    if (iotlbentry->attrs.byte_swap) {
 -        op ^= MO_BSWAP;
 -    }
 -
      section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
      mr = section->mr;
      mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      bool locked = false;
      MemTxResult r;
 -    if (iotlbentry->attrs.byte_swap) {
 -        op ^= MO_BSWAP;
 -    }
 -
      section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
      mr = section->mr;
      mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
                               wp_access, retaddr);
      }
 -    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO)) {
 -        /* I/O access */
 +    /* Reject I/O access, or other required slow-path.  */
 +    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP)) {
          return NULL;
      }
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
      /* Handle anything that isn't just a straight memory access.  */
      if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
          CPUIOTLBEntry *iotlbentry;
 +        bool need_swap;
          /* For anything that is unaligned, recurse through full_load.  */
          if ((addr & (size - 1)) != 0) {
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
              /* On watchpoint hit, this will longjmp out.  */
              cpu_check_watchpoint(env_cpu(env), addr, size,
                                   iotlbentry->attrs, BP_MEM_READ, retaddr);
 -
 -            /* The backing page may or may not require I/O.  */
 -            tlb_addr &= ~TLB_WATCHPOINT;
 -            if ((tlb_addr & ~TARGET_PAGE_MASK) == 0) {
 -                goto do_aligned_access;
 -            }
          }
-+        need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
+         def = &tcg_op_defs[opc];
-+
+-        nb_oargs = def->nb_oargs;
-         /* Handle I/O access.  */
+-        nb_iargs = def->nb_iargs;
--        return io_readx(env, iotlbentry, mmu_idx, addr,
+-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
--                        retaddr, access_type, op);
+-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
-+        if (likely(tlb_addr & TLB_MMIO)) {
++        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
-+            return io_readx(env, iotlbentry, mmu_idx, addr, retaddr,
++        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
-+                            access_type, op ^ (need_swap * MO_BSWAP));
-+        }
+         /* For commutative operations make constant second argument */
-+
+         switch (opc) {
-+        haddr = (void *)((uintptr_t)addr + entry->addend);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+
-+        /*
+         CASE_OP_32_64(qemu_ld):
-+         * Keep these two load_memop separate to ensure that the compiler
+             {
-+         * is able to fold the entire function to a single instruction.
+-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
-+         * There is a build-time assert inside to remind you of this.  ;-)
++                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
-+         */
+                 MemOp mop = get_memop(oi);
-+        if (unlikely(need_swap)) {
+                 if (!(mop & MO_SIGN)) {
-+            return load_memop(haddr, op ^ MO_BSWAP);
+                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
-+        }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 +        return load_memop(haddr, op);
      }
      /* Handle slow unaligned access (it spans two pages or IO).  */
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
          return res & MAKE_64BIT_MASK(0, size * 8);
      }
 - do_aligned_access:
      haddr = (void *)((uintptr_t)addr + entry->addend);
      return load_memop(haddr, op);
  }
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
      /* Handle anything that isn't just a straight memory access.  */
      if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
          CPUIOTLBEntry *iotlbentry;
 +        bool need_swap;
          /* For anything that is unaligned, recurse through byte stores.  */
          if ((addr & (size - 1)) != 0) {
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
              /* On watchpoint hit, this will longjmp out.  */
              cpu_check_watchpoint(env_cpu(env), addr, size,
                                   iotlbentry->attrs, BP_MEM_WRITE, retaddr);
 -
 -            /* The backing page may or may not require I/O.  */
 -            tlb_addr &= ~TLB_WATCHPOINT;
 -            if ((tlb_addr & ~TARGET_PAGE_MASK) == 0) {
 -                goto do_aligned_access;
 -            }
          }
-+        need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
+         if (partmask == 0) {
-+
+-            tcg_debug_assert(nb_oargs == 1);
-         /* Handle I/O access.  */
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
--        io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr, op);
+             continue;
-+        if (likely(tlb_addr & (TLB_MMIO | TLB_NOTDIRTY))) {
+         }
-+            io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
+         if (affected == 0) {
-+                      op ^ (need_swap * MO_BSWAP));
+-            tcg_debug_assert(nb_oargs == 1);
-+            return;
+             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-+        }
+             continue;
-+
+         }
-+        haddr = (void *)((uintptr_t)addr + entry->addend);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+
+             } else if (args_are_copies(op->args[1], op->args[2])) {
-+        /*
+                 op->opc = INDEX_op_dup_vec;
-+         * Keep these two store_memop separate to ensure that the compiler
+                 TCGOP_VECE(op) = MO_32;
-+         * is able to fold the entire function to a single instruction.
+-                nb_iargs = 1;
-+         * There is a build-time assert inside to remind you of this.  ;-)
+             }
-+         */
+             break;
-+        if (unlikely(need_swap)) {
-+            store_memop(haddr, val, op ^ MO_BSWAP);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-+        } else {
+                 op->opc = opc = (opc == INDEX_op_movcond_i32
-+            store_memop(haddr, val, op);
+                                  ? INDEX_op_setcond_i32
-+        }
+                                  : INDEX_op_setcond_i64);
-         return;
+-                nb_iargs = 2;
-     }
+             }
+             break;
-@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
-         return;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     }
+         if (def->flags & TCG_OPF_BB_END) {
+             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-- do_aligned_access:
+         } else {
-     haddr = (void *)((uintptr_t)addr + entry->addend);
++            int nb_oargs = def->nb_oargs;
-     store_memop(haddr, val, op);
+             for (i = 0; i < nb_oargs; i++) {
- }
+                 reset_temp(op->args[i]);
                  /* Save the corresponding known-zero bits mask for the
 --
-.17.1
+.25.1

-New patch
+[PULL 15/56] tcg/optimize: Change fail return for do_constant_folding_cond*
+Return -1 instead of 2 for failure, so that we can
 use comparisons against 0 for all cases.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
 file changed, 74 insertions(+), 71 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
      }
  }
 -/* Return 2 if the condition can't be simplified, and the result
 -   of the condition (0 or 1) if it can */
 -static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
 -                                       TCGArg y, TCGCond c)
 +/*
 + * Return -1 if the condition can't be simplified,
 + * and the result of the condition (0 or 1) if it can.
 + */
 +static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +                                    TCGArg y, TCGCond c)
  {
      uint64_t xv = arg_info(x)->val;
      uint64_t yv = arg_info(y)->val;
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
          case TCG_COND_GEU:
              return 1;
          default:
 -            return 2;
 +            return -1;
          }
      }
 -    return 2;
 +    return -1;
  }
 -/* Return 2 if the condition can't be simplified, and the result
 -   of the condition (0 or 1) if it can */
 -static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
 +/*
 + * Return -1 if the condition can't be simplified,
 + * and the result of the condition (0 or 1) if it can.
 + */
 +static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
  {
      TCGArg al = p1[0], ah = p1[1];
      TCGArg bl = p2[0], bh = p2[1];
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
      if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
          return do_constant_folding_cond_eq(c);
      }
 -    return 2;
 +    return -1;
  }
  static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          CASE_OP_32_64(setcond):
 -            tmp = do_constant_folding_cond(opc, op->args[1],
 -                                           op->args[2], op->args[3]);
 -            if (tmp != 2) {
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 +            i = do_constant_folding_cond(opc, op->args[1],
 +                                         op->args[2], op->args[3]);
 +            if (i >= 0) {
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                  continue;
              }
              break;
          CASE_OP_32_64(brcond):
 -            tmp = do_constant_folding_cond(opc, op->args[0],
 -                                           op->args[1], op->args[2]);
 -            switch (tmp) {
 -            case 0:
 +            i = do_constant_folding_cond(opc, op->args[0],
 +                                         op->args[1], op->args[2]);
 +            if (i == 0) {
                  tcg_op_remove(s, op);
                  continue;
 -            case 1:
 +            } else if (i > 0) {
                  memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                  op->opc = opc = INDEX_op_br;
                  op->args[0] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          CASE_OP_32_64(movcond):
 -            tmp = do_constant_folding_cond(opc, op->args[1],
 -                                           op->args[2], op->args[5]);
 -            if (tmp != 2) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
 +            i = do_constant_folding_cond(opc, op->args[1],
 +                                         op->args[2], op->args[5]);
 +            if (i >= 0) {
 +                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
                  continue;
              }
              if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          case INDEX_op_brcond2_i32:
 -            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
 -                                            op->args[4]);
 -            if (tmp == 0) {
 +            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
 +                                          op->args[4]);
 +            if (i == 0) {
              do_brcond_false:
                  tcg_op_remove(s, op);
                  continue;
              }
 -            if (tmp == 1) {
 +            if (i > 0) {
              do_brcond_true:
                  op->opc = opc = INDEX_op_br;
                  op->args[0] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[4] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[0], op->args[2],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[0], op->args[2],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_brcond_false;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_high;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_brcond_false;
 -                } else if (tmp != 1) {
 +                } else if (i < 0) {
                      break;
                  }
              do_brcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[4] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[0], op->args[2],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[0], op->args[2],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_brcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_brcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_brcond_true;
                  }
              }
              break;
          case INDEX_op_setcond2_i32:
 -            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
 -                                            op->args[5]);
 -            if (tmp != 2) {
 +            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
 +                                          op->args[5]);
 +            if (i >= 0) {
              do_setcond_const:
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 +                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                  continue;
              }
              if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_EQ) {
                  /* Simplify EQ comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_const;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_high;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_EQ);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_EQ);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp != 1) {
 +                } else if (i < 0) {
                      break;
                  }
              do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              if (op->args[5] == TCG_COND_NE) {
                  /* Simplify NE comparisons where one of the pairs
                     can be simplified.  */
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[1], op->args[3],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[1], op->args[3],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_high;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
 -                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
 -                                               op->args[2], op->args[4],
 -                                               TCG_COND_NE);
 -                if (tmp == 0) {
 +                i = do_constant_folding_cond(INDEX_op_setcond_i32,
 +                                             op->args[2], op->args[4],
 +                                             TCG_COND_NE);
 +                if (i == 0) {
                      goto do_setcond_low;
 -                } else if (tmp == 1) {
 +                } else if (i > 0) {
                      goto do_setcond_const;
                  }
              }
 --
 .25.1

-[PULL 13/16] cputlb: Remove cpu->mem_io_vaddr
+[PULL 16/56] tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
-With the merge of notdirty handling into store_helper,
+This will allow callers to tail call to these functions
-the last user of cpu->mem_io_vaddr was removed.
+and return true indicating processing complete.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/hw/core/cpu.h | 2 --
+ tcg/optimize.c | 9 +++++----
- accel/tcg/cputlb.c    | 2 --
+file changed, 5 insertions(+), 4 deletions(-)
  hw/core/cpu.c         | 1 -
 files changed, 5 deletions(-)
-diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/hw/core/cpu.h
+--- a/tcg/optimize.c
-+++ b/include/hw/core/cpu.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct qemu_work_item;
+@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
-  * @next_cpu: Next CPU sharing TB cache.
+     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
-  * @opaque: User data.
+ }
-  * @mem_io_pc: Host Program Counter at which the memory was accessed.
-- * @mem_io_vaddr: Target virtual address at which the memory was accessed.
+-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-  * @kvm_fd: vCPU file descriptor for KVM.
++static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-  * @work_mutex: Lock to prevent multiple access to queued_work_*.
+ {
-  * @queued_work_first: First asynchronous work pending.
+     TCGTemp *dst_ts = arg_temp(dst);
-@@ -XXX,XX +XXX,XX @@ struct CPUState {
+     TCGTemp *src_ts = arg_temp(src);
-      * we store some rarely used information in the CPU context.
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-      */
-     uintptr_t mem_io_pc;
+     if (ts_are_copies(dst_ts, src_ts)) {
--    vaddr mem_io_vaddr;
+         tcg_op_remove(ctx->tcg, op);
-     /*
+-        return;
-      * This is only needed for the legacy cpu_unassigned_access() hook;
++        return true;
       * when all targets using it have been converted to use
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
          cpu_io_recompile(cpu, retaddr);
      }
--    cpu->mem_io_vaddr = addr;
+     reset_ts(dst_ts);
-     cpu->mem_io_access_type = access_type;
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+         di->is_const = si->is_const;
-     if (mr->global_locking && !qemu_mutex_iothread_locked()) {
+         di->val = si->val;
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      if (!cpu->can_do_io) {
          cpu_io_recompile(cpu, retaddr);
      }
--    cpu->mem_io_vaddr = addr;
++    return true;
-     cpu->mem_io_pc = retaddr;
+ }
-     if (mr->global_locking && !qemu_mutex_iothread_locked()) {
+-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-diff --git a/hw/core/cpu.c b/hw/core/cpu.c
++static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-index XXXXXXX..XXXXXXX 100644
+                              TCGArg dst, uint64_t val)
---- a/hw/core/cpu.c
+ {
-+++ b/hw/core/cpu.c
+     const TCGOpDef *def = &tcg_op_defs[op->opc];
-@@ -XXX,XX +XXX,XX @@ static void cpu_common_reset(CPUState *cpu)
+@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-     cpu->interrupt_request = 0;
+     /* Convert movi to mov with constant temp. */
-     cpu->halted = 0;
+     tv = tcg_constant_internal(type, val);
-     cpu->mem_io_pc = 0;
+     init_ts_info(ctx, tv);
--    cpu->mem_io_vaddr = 0;
+-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
-     cpu->icount_extra = 0;
++    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
-     atomic_set(&cpu->icount_decr_ptr->u32, 0);
+ }
-     cpu->can_do_io = 1;
  static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
 --
-.17.1
+.25.1

-[PULL 07/16] exec: Adjust notdirty tracing
+[PULL 17/56] tcg/optimize: Split out finish_folding
-The memory_region_tb_read tracepoint is unreachable, since notdirty
+Copy z_mask into OptContext, for writeback to the
-is supposed to apply only to writes.  The memory_region_tb_write
+first output within the new function.
 tracepoint is mis-named, because notdirty is not only used for TB
 invalidation.  It is also used for e.g. VGA RAM updates and migration.
 Replace memory_region_tb_write with memory_notdirty_write_access,
 and place it in memory_notdirty_write_prepare where it can catch
 all of the instances.  Add memory_notdirty_set_dirty to log when
 we no longer intercept writes to a page.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: David Hildenbrand <david@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- exec.c       | 3 +++
+ tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
- memory.c     | 4 ----
+file changed, 33 insertions(+), 16 deletions(-)
  trace-events | 4 ++--
 files changed, 5 insertions(+), 6 deletions(-)
-diff --git a/exec.c b/exec.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/exec.c
+--- a/tcg/optimize.c
-+++ b/exec.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
-     ndi->size = size;
+     TCGContext *tcg;
-     ndi->pages = NULL;
+     TCGOp *prev_mb;
+     TCGTempSet temps_used;
 +    trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
 +
-     assert(tcg_enabled());
++    /* In flight values from optimization. */
-     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
++    uint64_t z_mask;
-         ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
+ } OptContext;
-@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_complete(NotDirtyInfo *ndi)
-     /* we remove the notdirty callback only if the code has been
+ static inline TempOptInfo *ts_info(TCGTemp *ts)
-        flushed */
+@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
      if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
 +        trace_memory_notdirty_set_dirty(ndi->mem_vaddr);
          tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
      }
  }
-diff --git a/memory.c b/memory.c
-index XXXXXXX..XXXXXXX 100644
++static void finish_folding(OptContext *ctx, TCGOp *op)
---- a/memory.c
++{
-+++ b/memory.c
++    const TCGOpDef *def = &tcg_op_defs[op->opc];
-@@ -XXX,XX +XXX,XX @@ static MemTxResult  memory_region_read_accessor(MemoryRegion *mr,
++    int i, nb_oargs;
-         /* Accesses to code which has previously been translated into a TB show
++
-          * up in the MMIO path, as accesses to the io_mem_notdirty
++    /*
-          * MemoryRegion. */
++     * For an opcode that ends a BB, reset all temp data.
--        trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size);
++     * We do no cross-BB optimization.
-     } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
++     */
-         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
++    if (def->flags & TCG_OPF_BB_END) {
-         trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
++        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
-@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr,
++        ctx->prev_mb = NULL;
-         /* Accesses to code which has previously been translated into a TB show
++        return;
-          * up in the MMIO path, as accesses to the io_mem_notdirty
++    }
-          * MemoryRegion. */
++
--        trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size);
++    nb_oargs = def->nb_oargs;
-     } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
++    for (i = 0; i < nb_oargs; i++) {
-         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
++        reset_temp(op->args[i]);
-         trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
++        /*
-@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
++         * Save the corresponding known-zero bits mask for the
-         /* Accesses to code which has previously been translated into a TB show
++         * first output argument (only one supported so far).
-          * up in the MMIO path, as accesses to the io_mem_notdirty
++         */
-          * MemoryRegion. */
++        if (i == 0) {
--        trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size);
++            arg_info(op->args[i])->z_mask = ctx->z_mask;
-     } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
++        }
-         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
++    }
-         trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
++}
-@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr,
++
-         /* Accesses to code which has previously been translated into a TB show
+ static bool fold_call(OptContext *ctx, TCGOp *op)
-          * up in the MMIO path, as accesses to the io_mem_notdirty
+ {
-          * MemoryRegion. */
+     TCGContext *s = ctx->tcg;
--        trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
-     } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
+             partmask &= 0xffffffffu;
-         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
+             affected &= 0xffffffffu;
-         trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
+         }
-diff --git a/trace-events b/trace-events
++        ctx.z_mask = z_mask;
-index XXXXXXX..XXXXXXX 100644
---- a/trace-events
+         if (partmask == 0) {
-+++ b/trace-events
+             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-@@ -XXX,XX +XXX,XX @@ dma_map_wait(void *dbs) "dbs=%p"
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
- find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx64
+             break;
- find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64
+         }
- ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d"
-+memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
+-        /* Some of the folding above can change opc. */
-+memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
+-        opc = op->opc;
+-        def = &tcg_op_defs[opc];
- # memory.c
+-        if (def->flags & TCG_OPF_BB_END) {
- memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
+-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
- memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
+-        } else {
- memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
+-            int nb_oargs = def->nb_oargs;
- memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
+-            for (i = 0; i < nb_oargs; i++) {
--memory_region_tb_read(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
+-                reset_temp(op->args[i]);
--memory_region_tb_write(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
+-                /* Save the corresponding known-zero bits mask for the
- memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
+-                   first output argument (only one supported so far). */
- memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
+-                if (i == 0) {
- flatview_new(void *view, void *root) "%p (root %p)"
+-                    arg_info(op->args[i])->z_mask = z_mask;
 -                }
 -            }
 -        }
 +        finish_folding(&ctx, op);
          /* Eliminate duplicate and redundant fence instructions.  */
          if (ctx.prev_mb) {
 --
-.17.1
+.25.1

-New patch
+[PULL 18/56] tcg/optimize: Use a boolean to avoid a mass of continues
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 9 ++++++---
+file changed, 6 insertions(+), 3 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         uint64_t z_mask, partmask, affected, tmp;
+         TCGOpcode opc = op->opc;
+         const TCGOpDef *def;
++        bool done = false;
+         /* Calls are special. */
+         if (opc == INDEX_op_call) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+            allocator where needed and possible.  Also detect copies. */
+         switch (opc) {
+         CASE_OP_32_64_VEC(mov):
+-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-            continue;
++            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
++            break;
+         case INDEX_op_dup_vec:
+             if (arg_is_const(op->args[1])) {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        finish_folding(&ctx, op);
++        if (!done) {
++            finish_folding(&ctx, op);
++        }
+         /* Eliminate duplicate and redundant fence instructions.  */
+         if (ctx.prev_mb) {
+--
+.25.1

-New patch
+[PULL 19/56] tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
+This puts the separate mb optimization into the same framework
+as the others.  While fold_qemu_{ld,st} are currently identical,
+that won't last as more code gets moved.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
+file changed, 51 insertions(+), 38 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_mb(OptContext *ctx, TCGOp *op)
++{
++    /* Eliminate duplicate and redundant fence instructions.  */
++    if (ctx->prev_mb) {
++        /*
++         * Merge two barriers of the same type into one,
++         * or a weaker barrier into a stronger one,
++         * or two weaker barriers into a stronger one.
++         *   mb X; mb Y => mb X|Y
++         *   mb; strl => mb; st
++         *   ldaq; mb => ld; mb
++         *   ldaq; strl => ld; mb; st
++         * Other combinations are also merged into a strong
++         * barrier.  This is stricter than specified but for
++         * the purposes of TCG is better than not optimizing.
++         */
++        ctx->prev_mb->args[0] |= op->args[0];
++        tcg_op_remove(ctx->tcg, op);
++    } else {
++        ctx->prev_mb = op;
++    }
++    return true;
++}
++
++static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
++{
++    /* Opcodes that touch guest memory stop the mb optimization.  */
++    ctx->prev_mb = NULL;
++    return false;
++}
++
++static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
++{
++    /* Opcodes that touch guest memory stop the mb optimization.  */
++    ctx->prev_mb = NULL;
++    return false;
++}
++
+ /* Propagate constants and copies, fold constant expressions. */
+ void tcg_optimize(TCGContext *s)
+ {
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
++        case INDEX_op_mb:
++            done = fold_mb(&ctx, op);
++            break;
++        case INDEX_op_qemu_ld_i32:
++        case INDEX_op_qemu_ld_i64:
++            done = fold_qemu_ld(&ctx, op);
++            break;
++        case INDEX_op_qemu_st_i32:
++        case INDEX_op_qemu_st8_i32:
++        case INDEX_op_qemu_st_i64:
++            done = fold_qemu_st(&ctx, op);
++            break;
++
+         default:
+             break;
+         }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         if (!done) {
+             finish_folding(&ctx, op);
+         }
+-
+-        /* Eliminate duplicate and redundant fence instructions.  */
+-        if (ctx.prev_mb) {
+-            switch (opc) {
+-            case INDEX_op_mb:
+-                /* Merge two barriers of the same type into one,
+-                 * or a weaker barrier into a stronger one,
+-                 * or two weaker barriers into a stronger one.
+-                 *   mb X; mb Y => mb X|Y
+-                 *   mb; strl => mb; st
+-                 *   ldaq; mb => ld; mb
+-                 *   ldaq; strl => ld; mb; st
+-                 * Other combinations are also merged into a strong
+-                 * barrier.  This is stricter than specified but for
+-                 * the purposes of TCG is better than not optimizing.
+-                 */
+-                ctx.prev_mb->args[0] |= op->args[0];
+-                tcg_op_remove(s, op);
+-                break;
+-
+-            default:
+-                /* Opcodes that end the block stop the optimization.  */
+-                if ((def->flags & TCG_OPF_BB_END) == 0) {
+-                    break;
+-                }
+-                /* fallthru */
+-            case INDEX_op_qemu_ld_i32:
+-            case INDEX_op_qemu_ld_i64:
+-            case INDEX_op_qemu_st_i32:
+-            case INDEX_op_qemu_st8_i32:
+-            case INDEX_op_qemu_st_i64:
+-                /* Opcodes that touch guest memory stop the optimization.  */
+-                ctx.prev_mb = NULL;
+-                break;
+-            }
+-        } else if (opc == INDEX_op_mb) {
+-            ctx.prev_mb = op;
+-        }
+     }
+ }
+--
+.25.1

-New patch
+[PULL 20/56] tcg/optimize: Split out fold_const{1,2}
+Split out a whole bunch of placeholder functions, which are
 currently identical.  That won't last as more code gets moved.
 Use CASE_OP_32_64_VEC for some logical operators that previously
 missed the addition of vectors.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
 file changed, 219 insertions(+), 52 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      }
  }
 +/*
 + * The fold_* functions return true when processing is complete,
 + * usually by folding the operation to a constant or to a copy,
 + * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
 + * like collect information about the value produced, for use in
 + * optimizing a subsequent operation.
 + *
 + * These first fold_* functions are all helpers, used by other
 + * folders for more specific operations.
 + */
 +
 +static bool fold_const1(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1])) {
 +        uint64_t t;
 +
 +        t = arg_info(op->args[1])->val;
 +        t = do_constant_folding(op->opc, t, 0);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
 +    }
 +    return false;
 +}
 +
 +static bool fold_const2(OptContext *ctx, TCGOp *op)
 +{
 +    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 +        uint64_t t1 = arg_info(op->args[1])->val;
 +        uint64_t t2 = arg_info(op->args[2])->val;
 +
 +        t1 = do_constant_folding(op->opc, t1, t2);
 +        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
 +    }
 +    return false;
 +}
 +
 +/*
 + * These outermost fold_<op> functions are sorted alphabetically.
 + */
 +
 +static bool fold_add(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_and(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_andc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_call(OptContext *ctx, TCGOp *op)
  {
      TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
      return true;
  }
 +static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_divide(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_eqv(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_exts(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_extu(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
  static bool fold_mb(OptContext *ctx, TCGOp *op)
  {
      /* Eliminate duplicate and redundant fence instructions.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
      return true;
  }
 +static bool fold_mul(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_nand(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_neg(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_nor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_not(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const1(ctx, op);
 +}
 +
 +static bool fold_or(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_orc(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
      /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_remainder(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_shift(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_sub(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
 +static bool fold_xor(OptContext *ctx, TCGOp *op)
 +{
 +    return fold_const2(ctx, op);
 +}
 +
  /* Propagate constants and copies, fold constant expressions. */
  void tcg_optimize(TCGContext *s)
  {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(not):
 -        CASE_OP_32_64(neg):
 -        CASE_OP_32_64(ext8s):
 -        CASE_OP_32_64(ext8u):
 -        CASE_OP_32_64(ext16s):
 -        CASE_OP_32_64(ext16u):
 -        CASE_OP_32_64(ctpop):
 -        case INDEX_op_ext32s_i64:
 -        case INDEX_op_ext32u_i64:
 -        case INDEX_op_ext_i32_i64:
 -        case INDEX_op_extu_i32_i64:
 -        case INDEX_op_extrl_i64_i32:
 -        case INDEX_op_extrh_i64_i32:
 -            if (arg_is_const(op->args[1])) {
 -                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          CASE_OP_32_64(bswap16):
          CASE_OP_32_64(bswap32):
          case INDEX_op_bswap64_i64:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 -        CASE_OP_32_64(add):
 -        CASE_OP_32_64(sub):
 -        CASE_OP_32_64(mul):
 -        CASE_OP_32_64(or):
 -        CASE_OP_32_64(and):
 -        CASE_OP_32_64(xor):
 -        CASE_OP_32_64(shl):
 -        CASE_OP_32_64(shr):
 -        CASE_OP_32_64(sar):
 -        CASE_OP_32_64(rotl):
 -        CASE_OP_32_64(rotr):
 -        CASE_OP_32_64(andc):
 -        CASE_OP_32_64(orc):
 -        CASE_OP_32_64(eqv):
 -        CASE_OP_32_64(nand):
 -        CASE_OP_32_64(nor):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -        CASE_OP_32_64(div):
 -        CASE_OP_32_64(divu):
 -        CASE_OP_32_64(rem):
 -        CASE_OP_32_64(remu):
 -            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
 -                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
 -                                          arg_info(op->args[2])->val);
 -                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
 -                continue;
 -            }
 -            break;
 -
          CASE_OP_32_64(clz):
          CASE_OP_32_64(ctz):
              if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              }
              break;
 +        default:
 +            break;
 +
 +        /* ---------------------------------------------------------- */
 +        /* Sorted alphabetically by opcode as much as possible. */
 +
 +        CASE_OP_32_64_VEC(add):
 +            done = fold_add(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(and):
 +            done = fold_and(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(andc):
 +            done = fold_andc(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ctpop):
 +            done = fold_ctpop(&ctx, op);
 +            break;
 +        CASE_OP_32_64(div):
 +        CASE_OP_32_64(divu):
 +            done = fold_divide(&ctx, op);
 +            break;
 +        CASE_OP_32_64(eqv):
 +            done = fold_eqv(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ext8s):
 +        CASE_OP_32_64(ext16s):
 +        case INDEX_op_ext32s_i64:
 +        case INDEX_op_ext_i32_i64:
 +            done = fold_exts(&ctx, op);
 +            break;
 +        CASE_OP_32_64(ext8u):
 +        CASE_OP_32_64(ext16u):
 +        case INDEX_op_ext32u_i64:
 +        case INDEX_op_extu_i32_i64:
 +        case INDEX_op_extrl_i64_i32:
 +        case INDEX_op_extrh_i64_i32:
 +            done = fold_extu(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 +        CASE_OP_32_64(mul):
 +            done = fold_mul(&ctx, op);
 +            break;
 +        CASE_OP_32_64(mulsh):
 +        CASE_OP_32_64(muluh):
 +            done = fold_mul_highpart(&ctx, op);
 +            break;
 +        CASE_OP_32_64(nand):
 +            done = fold_nand(&ctx, op);
 +            break;
 +        CASE_OP_32_64(neg):
 +            done = fold_neg(&ctx, op);
 +            break;
 +        CASE_OP_32_64(nor):
 +            done = fold_nor(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(not):
 +            done = fold_not(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(or):
 +            done = fold_or(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(orc):
 +            done = fold_orc(&ctx, op);
 +            break;
          case INDEX_op_qemu_ld_i32:
          case INDEX_op_qemu_ld_i64:
              done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_qemu_st_i64:
              done = fold_qemu_st(&ctx, op);
              break;
 -
 -        default:
 +        CASE_OP_32_64(rem):
 +        CASE_OP_32_64(remu):
 +            done = fold_remainder(&ctx, op);
 +            break;
 +        CASE_OP_32_64(rotl):
 +        CASE_OP_32_64(rotr):
 +        CASE_OP_32_64(sar):
 +        CASE_OP_32_64(shl):
 +        CASE_OP_32_64(shr):
 +            done = fold_shift(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(sub):
 +            done = fold_sub(&ctx, op);
 +            break;
 +        CASE_OP_32_64_VEC(xor):
 +            done = fold_xor(&ctx, op);
              break;
          }
 --
 .25.1

-New patch
+[PULL 21/56] tcg/optimize: Split out fold_setcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
+file changed, 72 insertions(+), 73 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_setcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
++            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
++            goto do_setcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            goto do_setcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
++                                     op->args[4], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_setcond_const;
++        case 1:
++            op->args[2] = op->args[3];
++            op->args[3] = cond;
++            op->opc = INDEX_op_setcond_i32;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_setcond_high:
++        op->args[1] = op->args[2];
++        op->args[2] = op->args[4];
++        op->args[3] = cond;
++        op->opc = INDEX_op_setcond_i32;
++        break;
++    }
++    return false;
++
++ do_setcond_const:
++    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++}
++
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_setcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+-                                          op->args[5]);
+-            if (i >= 0) {
+-            do_setcond_const:
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0
+-                 && arg_is_const(op->args[4])
+-                 && arg_info(op->args[4])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_setcond_high:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_const;
+-                } else if (i > 0) {
+-                    goto do_setcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_setcond_low:
+-                reset_temp(op->args[0]);
+-                arg_info(op->args[0])->z_mask = 1;
+-                op->opc = INDEX_op_setcond_i32;
+-                op->args[2] = op->args[3];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[5] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_high;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+-                                             op->args[2], op->args[4],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_setcond_low;
+-                } else if (i > 0) {
+-                    goto do_setcond_const;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        case INDEX_op_setcond2_i32:
++            done = fold_setcond2(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 22/56] tcg/optimize: Split out fold_brcond2
+Reduce some code duplication by folding the NE and EQ cases.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
+file changed, 81 insertions(+), 78 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond2(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[4];
++    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
++    TCGArg label = op->args[5];
++    int inv = 0;
++
++    if (i >= 0) {
++        goto do_brcond_const;
++    }
++
++    switch (cond) {
++    case TCG_COND_LT:
++    case TCG_COND_GE:
++        /*
++         * Simplify LT/GE comparisons vs zero to a single compare
++         * vs the high word of the input.
++         */
++        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
++            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
++            goto do_brcond_high;
++        }
++        break;
++
++    case TCG_COND_NE:
++        inv = 1;
++        QEMU_FALLTHROUGH;
++    case TCG_COND_EQ:
++        /*
++         * Simplify EQ/NE comparisons where one of the pairs
++         * can be simplified.
++         */
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
++                                     op->args[2], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            goto do_brcond_high;
++        }
++
++        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
++                                     op->args[3], cond);
++        switch (i ^ inv) {
++        case 0:
++            goto do_brcond_const;
++        case 1:
++            op->opc = INDEX_op_brcond_i32;
++            op->args[1] = op->args[2];
++            op->args[2] = cond;
++            op->args[3] = label;
++            break;
++        }
++        break;
++
++    default:
++        break;
++
++    do_brcond_high:
++        op->opc = INDEX_op_brcond_i32;
++        op->args[0] = op->args[1];
++        op->args[1] = op->args[3];
++        op->args[2] = cond;
++        op->args[3] = label;
++        break;
++
++    do_brcond_const:
++        if (i == 0) {
++            tcg_op_remove(ctx->tcg, op);
++            return true;
++        }
++        op->opc = INDEX_op_br;
++        op->args[0] = label;
++        break;
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_brcond2_i32:
+-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+-                                          op->args[4]);
+-            if (i == 0) {
+-            do_brcond_false:
+-                tcg_op_remove(s, op);
+-                continue;
+-            }
+-            if (i > 0) {
+-            do_brcond_true:
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[5];
+-                break;
+-            }
+-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+-                 && arg_is_const(op->args[2])
+-                 && arg_info(op->args[2])->val == 0
+-                 && arg_is_const(op->args[3])
+-                 && arg_info(op->args[3])->val == 0) {
+-                /* Simplify LT/GE comparisons vs zero to a single compare
+-                   vs the high word of the input.  */
+-            do_brcond_high:
+-                op->opc = opc = INDEX_op_brcond_i32;
+-                op->args[0] = op->args[1];
+-                op->args[1] = op->args[3];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_EQ) {
+-                /* Simplify EQ comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i > 0) {
+-                    goto do_brcond_high;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_EQ);
+-                if (i == 0) {
+-                    goto do_brcond_false;
+-                } else if (i < 0) {
+-                    break;
+-                }
+-            do_brcond_low:
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = INDEX_op_brcond_i32;
+-                op->args[1] = op->args[2];
+-                op->args[2] = op->args[4];
+-                op->args[3] = op->args[5];
+-                break;
+-            }
+-            if (op->args[4] == TCG_COND_NE) {
+-                /* Simplify NE comparisons where one of the pairs
+-                   can be simplified.  */
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[0], op->args[2],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_high;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+-                                             op->args[1], op->args[3],
+-                                             TCG_COND_NE);
+-                if (i == 0) {
+-                    goto do_brcond_low;
+-                } else if (i > 0) {
+-                    goto do_brcond_true;
+-                }
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        case INDEX_op_brcond2_i32:
++            done = fold_brcond2(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 23/56] tcg/optimize: Split out fold_brcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 33 +++++++++++++++++++--------------
+file changed, 19 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_brcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[2];
++    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
++
++    if (i == 0) {
++        tcg_op_remove(ctx->tcg, op);
++        return true;
++    }
++    if (i > 0) {
++        op->opc = INDEX_op_br;
++        op->args[0] = op->args[3];
++    }
++    return false;
++}
++
+ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[4];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(brcond):
+-            i = do_constant_folding_cond(opc, op->args[0],
+-                                         op->args[1], op->args[2]);
+-            if (i == 0) {
+-                tcg_op_remove(s, op);
+-                continue;
+-            } else if (i > 0) {
+-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+-                op->opc = opc = INDEX_op_br;
+-                op->args[0] = op->args[3];
+-                break;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(andc):
+             done = fold_andc(&ctx, op);
+             break;
++        CASE_OP_32_64(brcond):
++            done = fold_brcond(&ctx, op);
++            break;
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 24/56] tcg/optimize: Split out fold_setcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 23 ++++++++++++++---------
+file changed, 14 insertions(+), 9 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_setcond(OptContext *ctx, TCGOp *op)
++{
++    TCGCond cond = op->args[3];
++    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+ {
+     TCGCond cond = op->args[5];
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(setcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[3]);
+-            if (i >= 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
+-                continue;
+-            }
+-            break;
+-
+         CASE_OP_32_64(movcond):
+             i = do_constant_folding_cond(opc, op->args[1],
+                                          op->args[2], op->args[5]);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(shr):
+             done = fold_shift(&ctx, op);
+             break;
++        CASE_OP_32_64(setcond):
++            done = fold_setcond(&ctx, op);
++            break;
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 25/56] tcg/optimize: Split out fold_mulu2_i32
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 37 +++++++++++++++++++++----------------
+file changed, 21 insertions(+), 16 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
++        uint32_t a = arg_info(op->args[2])->val;
++        uint32_t b = arg_info(op->args[3])->val;
++        uint64_t r = (uint64_t)a * b;
++        TCGArg rl, rh;
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++
++        rl = op->args[0];
++        rh = op->args[1];
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
++        return true;
++    }
++    return false;
++}
++
+ static bool fold_nand(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_mulu2_i32:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+-                uint32_t a = arg_info(op->args[2])->val;
+-                uint32_t b = arg_info(op->args[3])->val;
+-                uint64_t r = (uint64_t)a * b;
+-                TCGArg rl, rh;
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
+-
+-                rl = op->args[0];
+-                rh = op->args[1];
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(muluh):
+             done = fold_mul_highpart(&ctx, op);
+             break;
++        case INDEX_op_mulu2_i32:
++            done = fold_mulu2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64(nand):
+             done = fold_nand(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 26/56] tcg/optimize: Split out fold_addsub2_i32
+Add two additional helpers, fold_add2_i32 and fold_sub2_i32
+which will not be simple wrappers forever.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
+file changed, 44 insertions(+), 26 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
++{
++    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
++        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
++        uint32_t al = arg_info(op->args[2])->val;
++        uint32_t ah = arg_info(op->args[3])->val;
++        uint32_t bl = arg_info(op->args[4])->val;
++        uint32_t bh = arg_info(op->args[5])->val;
++        uint64_t a = ((uint64_t)ah << 32) | al;
++        uint64_t b = ((uint64_t)bh << 32) | bl;
++        TCGArg rl, rh;
++        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++
++        if (add) {
++            a += b;
++        } else {
++            a -= b;
++        }
++
++        rl = op->args[0];
++        rh = op->args[1];
++        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
++        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
++        return true;
++    }
++    return false;
++}
++
++static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
++{
++    return fold_addsub2_i32(ctx, op, true);
++}
++
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
++{
++    return fold_addsub2_i32(ctx, op, false);
++}
++
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        case INDEX_op_add2_i32:
+-        case INDEX_op_sub2_i32:
+-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
+-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+-                uint32_t al = arg_info(op->args[2])->val;
+-                uint32_t ah = arg_info(op->args[3])->val;
+-                uint32_t bl = arg_info(op->args[4])->val;
+-                uint32_t bh = arg_info(op->args[5])->val;
+-                uint64_t a = ((uint64_t)ah << 32) | al;
+-                uint64_t b = ((uint64_t)bh << 32) | bl;
+-                TCGArg rl, rh;
+-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
+-
+-                if (opc == INDEX_op_add2_i32) {
+-                    a += b;
+-                } else {
+-                    a -= b;
+-                }
+-
+-                rl = op->args[0];
+-                rh = op->args[1];
+-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
+-                continue;
+-            }
+-            break;
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(add):
+             done = fold_add(&ctx, op);
+             break;
++        case INDEX_op_add2_i32:
++            done = fold_add2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(and):
+             done = fold_and(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
++        case INDEX_op_sub2_i32:
++            done = fold_sub2_i32(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 27/56] tcg/optimize: Split out fold_movcond
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
+file changed, 31 insertions(+), 25 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_movcond(OptContext *ctx, TCGOp *op)
++{
++    TCGOpcode opc = op->opc;
++    TCGCond cond = op->args[5];
++    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
++
++    if (i >= 0) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
++    }
++
++    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
++        uint64_t tv = arg_info(op->args[3])->val;
++        uint64_t fv = arg_info(op->args[4])->val;
++
++        opc = (opc == INDEX_op_movcond_i32
++               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
++
++        if (tv == 1 && fv == 0) {
++            op->opc = opc;
++            op->args[3] = cond;
++        } else if (fv == 1 && tv == 0) {
++            op->opc = opc;
++            op->args[3] = tcg_invert_cond(cond);
++        }
++    }
++    return false;
++}
++
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(movcond):
+-            i = do_constant_folding_cond(opc, op->args[1],
+-                                         op->args[2], op->args[5]);
+-            if (i >= 0) {
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
+-                continue;
+-            }
+-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+-                uint64_t tv = arg_info(op->args[3])->val;
+-                uint64_t fv = arg_info(op->args[4])->val;
+-                TCGCond cond = op->args[5];
+-
+-                if (fv == 1 && tv == 0) {
+-                    cond = tcg_invert_cond(cond);
+-                } else if (!(tv == 1 && fv == 0)) {
+-                    break;
+-                }
+-                op->args[3] = cond;
+-                op->opc = opc = (opc == INDEX_op_movcond_i32
+-                                 ? INDEX_op_setcond_i32
+-                                 : INDEX_op_setcond_i64);
+-            }
+-            break;
+-
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_mb:
+             done = fold_mb(&ctx, op);
+             break;
++        CASE_OP_32_64(movcond):
++            done = fold_movcond(&ctx, op);
++            break;
+         CASE_OP_32_64(mul):
+             done = fold_mul(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 28/56] tcg/optimize: Split out fold_extract2
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
+file changed, 22 insertions(+), 17 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_extract2(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t v1 = arg_info(op->args[1])->val;
++        uint64_t v2 = arg_info(op->args[2])->val;
++        int shr = op->args[3];
++
++        if (op->opc == INDEX_op_extract2_i64) {
++            v1 >>= shr;
++            v2 <<= 64 - shr;
++        } else {
++            v1 = (uint32_t)v1 >> shr;
++            v2 = (int32_t)v2 << (32 - shr);
++        }
++        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
++    }
++    return false;
++}
++
+ static bool fold_exts(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const1(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(extract2):
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                uint64_t v1 = arg_info(op->args[1])->val;
+-                uint64_t v2 = arg_info(op->args[2])->val;
+-                int shr = op->args[3];
+-
+-                if (opc == INDEX_op_extract2_i64) {
+-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
+-                } else {
+-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
+-                                    ((uint32_t)v2 << (32 - shr)));
+-                }
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
++        CASE_OP_32_64(extract2):
++            done = fold_extract2(&ctx, op);
++            break;
+         CASE_OP_32_64(ext8s):
+         CASE_OP_32_64(ext16s):
+         case INDEX_op_ext32s_i64:
+--
+.25.1

-New patch
+[PULL 29/56] tcg/optimize: Split out fold_extract, fold_sextract
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
+file changed, 30 insertions(+), 18 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_extract(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t;
++
++        t = arg_info(op->args[1])->val;
++        t = extract64(t, op->args[2], op->args[3]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_extract2(OptContext *ctx, TCGOp *op)
+ {
+     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+ }
++static bool fold_sextract(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t;
++
++        t = arg_info(op->args[1])->val;
++        t = sextract64(t, op->args[2], op->args[3]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(extract):
+-            if (arg_is_const(op->args[1])) {
+-                tmp = extract64(arg_info(op->args[1])->val,
+-                                op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+-        CASE_OP_32_64(sextract):
+-            if (arg_is_const(op->args[1])) {
+-                tmp = sextract64(arg_info(op->args[1])->val,
+-                                 op->args[2], op->args[3]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
++        CASE_OP_32_64(extract):
++            done = fold_extract(&ctx, op);
++            break;
+         CASE_OP_32_64(extract2):
+             done = fold_extract2(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_setcond2_i32:
+             done = fold_setcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(sextract):
++            done = fold_sextract(&ctx, op);
++            break;
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 30/56] tcg/optimize: Split out fold_deposit
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 25 +++++++++++++++----------
+file changed, 15 insertions(+), 10 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+     return fold_const1(ctx, op);
+ }
++static bool fold_deposit(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t t1 = arg_info(op->args[1])->val;
++        uint64_t t2 = arg_info(op->args[2])->val;
++
++        t1 = deposit64(t1, op->args[3], op->args[4], t2);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
++    }
++    return false;
++}
++
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(deposit):
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tmp = deposit64(arg_info(op->args[1])->val,
+-                                op->args[3], op->args[4],
+-                                arg_info(op->args[2])->val);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
++        CASE_OP_32_64(deposit):
++            done = fold_deposit(&ctx, op);
++            break;
+         CASE_OP_32_64(div):
+         CASE_OP_32_64(divu):
+             done = fold_divide(&ctx, op);
+--
+.25.1

-New patch
+[PULL 31/56] tcg/optimize: Split out fold_count_zeros
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 32 ++++++++++++++++++--------------
+file changed, 18 insertions(+), 14 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++
++        if (t != 0) {
++            t = do_constant_folding(op->opc, t, 0);
++            return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++        }
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
++    }
++    return false;
++}
++
+ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const1(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(clz):
+-        CASE_OP_32_64(ctz):
+-            if (arg_is_const(op->args[1])) {
+-                TCGArg v = arg_info(op->args[1])->val;
+-                if (v != 0) {
+-                    tmp = do_constant_folding(opc, v, 0);
+-                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                } else {
+-                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
+-                }
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(clz):
++        CASE_OP_32_64(ctz):
++            done = fold_count_zeros(&ctx, op);
++            break;
+         CASE_OP_32_64(ctpop):
+             done = fold_ctpop(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 32/56] tcg/optimize: Split out fold_bswap
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 27 ++++++++++++++++-----------
+file changed, 16 insertions(+), 11 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+     return false;
+ }
++static bool fold_bswap(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++
++        t = do_constant_folding(op->opc, t, op->args[2]);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
+ static bool fold_call(OptContext *ctx, TCGOp *op)
+ {
+     TCGContext *s = ctx->tcg;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             }
+             break;
+-        CASE_OP_32_64(bswap16):
+-        CASE_OP_32_64(bswap32):
+-        case INDEX_op_bswap64_i64:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
+-                                          op->args[2]);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_brcond2_i32:
+             done = fold_brcond2(&ctx, op);
+             break;
++        CASE_OP_32_64(bswap16):
++        CASE_OP_32_64(bswap32):
++        case INDEX_op_bswap64_i64:
++            done = fold_bswap(&ctx, op);
++            break;
+         CASE_OP_32_64(clz):
+         CASE_OP_32_64(ctz):
+             done = fold_count_zeros(&ctx, op);
+--
+.25.1

-New patch
+[PULL 33/56] tcg/optimize: Split out fold_dup, fold_dup2
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
+file changed, 31 insertions(+), 22 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_dup(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1])) {
++        uint64_t t = arg_info(op->args[1])->val;
++        t = dup_const(TCGOP_VECE(op), t);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++    return false;
++}
++
++static bool fold_dup2(OptContext *ctx, TCGOp *op)
++{
++    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
++        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
++                               arg_info(op->args[2])->val);
++        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
++    }
++
++    if (args_are_copies(op->args[1], op->args[2])) {
++        op->opc = INDEX_op_dup_vec;
++        TCGOP_VECE(op) = MO_32;
++    }
++    return false;
++}
++
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+ {
+     return fold_const2(ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+             break;
+-        case INDEX_op_dup_vec:
+-            if (arg_is_const(op->args[1])) {
+-                tmp = arg_info(op->args[1])->val;
+-                tmp = dup_const(TCGOP_VECE(op), tmp);
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+-                continue;
+-            }
+-            break;
+-
+-        case INDEX_op_dup2_vec:
+-            assert(TCG_TARGET_REG_BITS == 32);
+-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0],
+-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
+-                                           arg_info(op->args[2])->val));
+-                continue;
+-            } else if (args_are_copies(op->args[1], op->args[2])) {
+-                op->opc = INDEX_op_dup_vec;
+-                TCGOP_VECE(op) = MO_32;
+-            }
+-            break;
+-
+         default:
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(divu):
+             done = fold_divide(&ctx, op);
+             break;
++        case INDEX_op_dup_vec:
++            done = fold_dup(&ctx, op);
++            break;
++        case INDEX_op_dup2_vec:
++            done = fold_dup2(&ctx, op);
++            break;
+         CASE_OP_32_64(eqv):
+             done = fold_eqv(&ctx, op);
+             break;
+--
+.25.1

-New patch
+[PULL 34/56] tcg/optimize: Split out fold_mov
+This is the final entry in the main switch that was in a
+different form.  After this, we have the option to convert
+the switch into a function dispatch table.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 27 ++++++++++++++-------------
+file changed, 14 insertions(+), 13 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
+     return true;
+ }
++static bool fold_mov(OptContext *ctx, TCGOp *op)
++{
++    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
++}
++
+ static bool fold_movcond(OptContext *ctx, TCGOp *op)
+ {
+     TCGOpcode opc = op->opc;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Propagate constants through copy operations and do constant
+-           folding.  Constants will be substituted to arguments by register
+-           allocator where needed and possible.  Also detect copies. */
++        /*
++         * Process each opcode.
++         * Sorted alphabetically by opcode as much as possible.
++         */
+         switch (opc) {
+-        CASE_OP_32_64_VEC(mov):
+-            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-            break;
+-
+-        default:
+-            break;
+-
+-        /* ---------------------------------------------------------- */
+-        /* Sorted alphabetically by opcode as much as possible. */
+-
+         CASE_OP_32_64_VEC(add):
+             done = fold_add(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         case INDEX_op_mb:
+             done = fold_mb(&ctx, op);
+             break;
++        CASE_OP_32_64_VEC(mov):
++            done = fold_mov(&ctx, op);
++            break;
+         CASE_OP_32_64(movcond):
+             done = fold_movcond(&ctx, op);
+             break;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
+             break;
++        default:
++            break;
+         }
+         if (!done) {
+--
+.25.1

-New patch
+[PULL 35/56] tcg/optimize: Split out fold_xx_to_i
+Pull the "op r, a, a => movi r, 0" optimization into a function,
+and use it in the outer opcode fold functions.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
+file changed, 24 insertions(+), 17 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
+     return false;
+ }
++/* If the binary operation has both arguments equal, fold to @i. */
++static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
++{
++    if (args_are_copies(op->args[1], op->args[2])) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ /*
+  * These outermost fold_<op> functions are sorted alphabetically.
+  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_brcond(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+ static bool fold_xor(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ /* Propagate constants and copies, fold constant expressions. */
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
+-        switch (opc) {
+-        CASE_OP_32_64_VEC(andc):
+-        CASE_OP_32_64_VEC(sub):
+-        CASE_OP_32_64_VEC(xor):
+-            if (args_are_copies(op->args[1], op->args[2])) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+-                continue;
+-            }
+-            break;
+-        default:
+-            break;
+-        }
+-
+         /*
+          * Process each opcode.
+          * Sorted alphabetically by opcode as much as possible.
+--
+.25.1

-[PULL 11/16] cputlb: Merge and move memory_notdirty_write_{prepare, complete}
+[PULL 36/56] tcg/optimize: Split out fold_xx_to_x
-Since 9458a9a1df1a, all readers of the dirty bitmaps wait
+Pull the "op r, a, a => mov r, a" optimization into a function,
-for the rcu lock, which means that they wait until the end
+and use it in the outer opcode fold functions.
 of any executing TranslationBlock.
-As a consequence, there is no need for the actual access
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
-to happen in between the _prepare and _complete.  Therefore,
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 we can improve things by merging the two functions into
 notdirty_write and dropping the NotDirtyInfo structure.
 In addition, the only users of notdirty_write are in cputlb.c,
 so move the merged function there.  Pass in the CPUIOTLBEntry
 from which the ram_addr_t may be computed.
 Reviewed-by: David Hildenbrand <david@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/memory-internal.h | 65 -----------------------------
+ tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
- accel/tcg/cputlb.c             | 76 +++++++++++++++++++---------------
+file changed, 24 insertions(+), 15 deletions(-)
  exec.c                         | 44 --------------------
 files changed, 42 insertions(+), 143 deletions(-)
-diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/memory-internal.h
+--- a/tcg/optimize.c
-+++ b/include/exec/memory-internal.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void address_space_dispatch_free(AddressSpaceDispatch *d);
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+     return false;
  void mtree_print_dispatch(struct AddressSpaceDispatch *d,
                            MemoryRegion *root);
 -
 -struct page_collection;
 -
 -/* Opaque struct for passing info from memory_notdirty_write_prepare()
 - * to memory_notdirty_write_complete(). Callers should treat all fields
 - * as private, with the exception of @active.
 - *
 - * @active is a field which is not touched by either the prepare or
 - * complete functions, but which the caller can use if it wishes to
 - * track whether it has called prepare for this struct and so needs
 - * to later call the complete function.
 - */
 -typedef struct {
 -    CPUState *cpu;
 -    struct page_collection *pages;
 -    ram_addr_t ram_addr;
 -    vaddr mem_vaddr;
 -    unsigned size;
 -    bool active;
 -} NotDirtyInfo;
 -
 -/**
 - * memory_notdirty_write_prepare: call before writing to non-dirty memory
 - * @ndi: pointer to opaque NotDirtyInfo struct
 - * @cpu: CPU doing the write
 - * @mem_vaddr: virtual address of write
 - * @ram_addr: the ram address of the write
 - * @size: size of write in bytes
 - *
 - * Any code which writes to the host memory corresponding to
 - * guest RAM which has been marked as NOTDIRTY must wrap those
 - * writes in calls to memory_notdirty_write_prepare() and
 - * memory_notdirty_write_complete():
 - *
 - *  NotDirtyInfo ndi;
 - *  memory_notdirty_write_prepare(&ndi, ....);
 - *  ... perform write here ...
 - *  memory_notdirty_write_complete(&ndi);
 - *
 - * These calls will ensure that we flush any TCG translated code for
 - * the memory being written, update the dirty bits and (if possible)
 - * remove the slowpath callback for writing to the memory.
 - *
 - * This must only be called if we are using TCG; it will assert otherwise.
 - *
 - * We may take locks in the prepare call, so callers must ensure that
 - * they don't exit (via longjump or otherwise) without calling complete.
 - *
 - * This call must only be made inside an RCU critical section.
 - * (Note that while we're executing a TCG TB we're always in an
 - * RCU critical section, which is likely to be the case for callers
 - * of these functions.)
 - */
 -void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
 -                                   CPUState *cpu,
 -                                   vaddr mem_vaddr,
 -                                   ram_addr_t ram_addr,
 -                                   unsigned size);
 -/**
 - * memory_notdirty_write_complete: finish write to non-dirty memory
 - * @ndi: pointer to the opaque NotDirtyInfo struct which was initialized
 - * by memory_not_dirty_write_prepare().
 - */
 -void memory_notdirty_write_complete(NotDirtyInfo *ndi);
 -
  #endif
  #endif
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
  #include "exec/helper-proto.h"
  #include "qemu/atomic.h"
  #include "qemu/atomic128.h"
 +#include "translate-all.h"
  /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
  /* #define DEBUG_TLB */
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
      return qemu_ram_addr_from_host_nofail(p);
  }
-+static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
++/* If the binary operation has both arguments equal, fold to identity. */
-+                           CPUIOTLBEntry *iotlbentry, uintptr_t retaddr)
++static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 +{
-+    ram_addr_t ram_addr = mem_vaddr + iotlbentry->addr;
++    if (args_are_copies(op->args[1], op->args[2])) {
-+
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
 +    trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
 +
 +    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
 +        struct page_collection *pages
 +            = page_collection_lock(ram_addr, ram_addr + size);
 +
 +        /* We require mem_io_pc in tb_invalidate_phys_page_range.  */
 +        cpu->mem_io_pc = retaddr;
 +
 +        tb_invalidate_phys_page_fast(pages, ram_addr, size);
 +        page_collection_unlock(pages);
 +    }
-+
++    return false;
 +    /*
 +     * Set both VGA and migration bits for simplicity and to remove
 +     * the notdirty callback faster.
 +     */
 +    cpu_physical_memory_set_dirty_range(ram_addr, size, DIRTY_CLIENTS_NOCODE);
 +
 +    /* We remove the notdirty callback only if the code has been flushed. */
 +    if (!cpu_physical_memory_is_clean(ram_addr)) {
 +        trace_memory_notdirty_set_dirty(mem_vaddr);
 +        tlb_set_dirty(cpu, mem_vaddr);
 +    }
 +}
 +
  /*
-  * Probe for whether the specified guest access is permitted. If it is not
+  * These outermost fold_<op> functions are sorted alphabetically.
-  * permitted then an exception will be taken in the same way as if this
++ *
-@@ -XXX,XX +XXX,XX @@ void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
++ * The ordering of the transformations should be:
- /* Probe for a read-modify-write atomic operation.  Do not allow unaligned
++ *   1) those that produce a constant
-  * operations, or io operations to proceed.  Return the host address.  */
++ *   2) those that produce a copy
- static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
++ *   3) those that produce information about the result value.
--                               TCGMemOpIdx oi, uintptr_t retaddr,
+  */
--                               NotDirtyInfo *ndi)
-+                               TCGMemOpIdx oi, uintptr_t retaddr)
+ static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
-     size_t mmu_idx = get_mmuidx(oi);
+-    return fold_const2(ctx, op);
-     uintptr_t index = tlb_index(env, mmu_idx, addr);
++    if (fold_const2(ctx, op) ||
-@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
++        fold_xx_to_x(ctx, op)) {
++        return true;
-     hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
++    }
++    return false;
--    ndi->active = false;
+ }
-     if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
--        ndi->active = true;
+ static bool fold_andc(OptContext *ctx, TCGOp *op)
--        memory_notdirty_write_prepare(ndi, env_cpu(env), addr,
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
--                                      qemu_ram_addr_from_host_nofail(hostaddr),
--                                      1 << s_bits);
+ static bool fold_or(OptContext *ctx, TCGOp *op)
-+        notdirty_write(env_cpu(env), addr, 1 << s_bits,
+ {
-+                       &env_tlb(env)->d[mmu_idx].iotlb[index], retaddr);
+-    return fold_const2(ctx, op);
-     }
++    if (fold_const2(ctx, op) ||
++        fold_xx_to_x(ctx, op)) {
-     return hostaddr;
++        return true;
-@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
++    }
-             return;
++    return false;
  }
  static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
--        haddr = (void *)((uintptr_t)addr + entry->addend);
+-        /* Simplify expression for "op r, a, a => mov r, a" cases */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(and):
 -            if (args_are_copies(op->args[1], op->args[2])) {
 -                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -                continue;
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
-         /* Handle clean RAM pages.  */
-         if (tlb_addr & TLB_NOTDIRTY) {
--            NotDirtyInfo ndi;
--
--            /* We require mem_io_pc in tb_invalidate_phys_page_range.  */
--            env_cpu(env)->mem_io_pc = retaddr;
--
--            memory_notdirty_write_prepare(&ndi, env_cpu(env), addr,
--                                          addr + iotlbentry->addr, size);
--
--            if (unlikely(need_swap)) {
--                store_memop(haddr, val, op ^ MO_BSWAP);
--            } else {
--                store_memop(haddr, val, op);
--            }
--
--            memory_notdirty_write_complete(&ndi);
--            return;
-+            notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
-         }
-+        haddr = (void *)((uintptr_t)addr + entry->addend);
-+
          /*
-          * Keep these two store_memop separate to ensure that the compiler
+          * Process each opcode.
-          * is able to fold the entire function to a single instruction.
+          * Sorted alphabetically by opcode as much as possible.
@@ -XXX,XX +XXX,XX @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
  #define EXTRA_ARGS     , TCGMemOpIdx oi, uintptr_t retaddr
  #define ATOMIC_NAME(X) \
      HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
 -#define ATOMIC_MMU_DECLS NotDirtyInfo ndi
 -#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr, &ndi)
 -#define ATOMIC_MMU_CLEANUP                              \
 -    do {                                                \
 -        if (unlikely(ndi.active)) {                     \
 -            memory_notdirty_write_complete(&ndi);       \
 -        }                                               \
 -    } while (0)
 +#define ATOMIC_MMU_DECLS
 +#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr)
 +#define ATOMIC_MMU_CLEANUP
  #define DATA_SIZE 1
  #include "atomic_template.h"
@@ -XXX,XX +XXX,XX @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
  #undef ATOMIC_MMU_LOOKUP
  #define EXTRA_ARGS         , TCGMemOpIdx oi
  #define ATOMIC_NAME(X)     HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
 -#define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, oi, GETPC(), &ndi)
 +#define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, oi, GETPC())
  #define DATA_SIZE 1
  #include "atomic_template.h"
 diff --git a/exec.c b/exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/exec.c
 +++ b/exec.c
@@ -XXX,XX +XXX,XX @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
      return block->offset + offset;
  }
 -/* Called within RCU critical section. */
 -void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
 -                          CPUState *cpu,
 -                          vaddr mem_vaddr,
 -                          ram_addr_t ram_addr,
 -                          unsigned size)
 -{
 -    ndi->cpu = cpu;
 -    ndi->ram_addr = ram_addr;
 -    ndi->mem_vaddr = mem_vaddr;
 -    ndi->size = size;
 -    ndi->pages = NULL;
 -
 -    trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
 -
 -    assert(tcg_enabled());
 -    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
 -        ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
 -        tb_invalidate_phys_page_fast(ndi->pages, ram_addr, size);
 -    }
 -}
 -
 -/* Called within RCU critical section. */
 -void memory_notdirty_write_complete(NotDirtyInfo *ndi)
 -{
 -    if (ndi->pages) {
 -        assert(tcg_enabled());
 -        page_collection_unlock(ndi->pages);
 -        ndi->pages = NULL;
 -    }
 -
 -    /* Set both VGA and migration bits for simplicity and to remove
 -     * the notdirty callback faster.
 -     */
 -    cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
 -                                        DIRTY_CLIENTS_NOCODE);
 -    /* we remove the notdirty callback only if the code has been
 -       flushed */
 -    if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
 -        trace_memory_notdirty_set_dirty(ndi->mem_vaddr);
 -        tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
 -    }
 -}
 -
  /* Generate a debug exception if a watchpoint has been hit.  */
  void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
                            MemTxAttrs attrs, int flags, uintptr_t ra)
 --
-.17.1
+.25.1

-New patch
+[PULL 37/56] tcg/optimize: Split out fold_xi_to_i
+Pull the "op r, a, 0 => movi r, 0" optimization into a function,
+and use it in the outer opcode fold functions.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 38 ++++++++++++++++++++------------------
+file changed, 20 insertions(+), 18 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
+     return false;
+ }
++/* If the binary operation has second argument @i, fold to @i. */
++static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
++{
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ /* If the binary operation has both arguments equal, fold to @i. */
+ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+ {
+@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+ static bool fold_and(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
++        fold_xi_to_i(ctx, op, 0) ||
+         fold_xx_to_x(ctx, op)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_i(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             continue;
+         }
+-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
+-        switch (opc) {
+-        CASE_OP_32_64_VEC(and):
+-        CASE_OP_32_64_VEC(mul):
+-        CASE_OP_32_64(muluh):
+-        CASE_OP_32_64(mulsh):
+-            if (arg_is_const(op->args[2])
+-                && arg_info(op->args[2])->val == 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+-                continue;
+-            }
+-            break;
+-        default:
+-            break;
+-        }
+-
+         /*
+          * Process each opcode.
+          * Sorted alphabetically by opcode as much as possible.
+--
+.25.1

-New patch
+[PULL 38/56] tcg/optimize: Add type to OptContext
+Compute the type of the operation early.
 There are at least 4 places that used a def->flags ladder
 to determine the type of the operation being optimized.
 There were two places that assumed !TCG_OPF_64BIT means
 TCG_TYPE_I32, and so could potentially compute incorrect
 results for vector operations.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 file changed, 89 insertions(+), 60 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      /* In flight values from optimization. */
      uint64_t z_mask;
 +    TCGType type;
  } OptContext;
  static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  {
      TCGTemp *dst_ts = arg_temp(dst);
      TCGTemp *src_ts = arg_temp(src);
 -    const TCGOpDef *def;
      TempOptInfo *di;
      TempOptInfo *si;
      uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      reset_ts(dst_ts);
      di = ts_info(dst_ts);
      si = ts_info(src_ts);
 -    def = &tcg_op_defs[op->opc];
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        new_op = INDEX_op_mov_vec;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        new_op = INDEX_op_mov_i64;
 -    } else {
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
          new_op = INDEX_op_mov_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        new_op = INDEX_op_mov_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
 +        new_op = INDEX_op_mov_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
      }
      op->opc = new_op;
 -    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
      op->args[0] = dst;
      op->args[1] = src;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
  static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                               TCGArg dst, uint64_t val)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op->opc];
 -    TCGType type;
 -    TCGTemp *tv;
 -
 -    if (def->flags & TCG_OPF_VECTOR) {
 -        type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -    } else if (def->flags & TCG_OPF_64BIT) {
 -        type = TCG_TYPE_I64;
 -    } else {
 -        type = TCG_TYPE_I32;
 -    }
 -
      /* Convert movi to mov with constant temp. */
 -    tv = tcg_constant_internal(type, val);
 +    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
 +
      init_ts_info(ctx, tv);
      return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
  }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
      }
  }
 -static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
 +static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
 +                                    uint64_t x, uint64_t y)
  {
 -    const TCGOpDef *def = &tcg_op_defs[op];
      uint64_t res = do_constant_folding_2(op, x, y);
 -    if (!(def->flags & TCG_OPF_64BIT)) {
 +    if (type == TCG_TYPE_I32) {
          res = (int32_t)res;
      }
      return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
   * Return -1 if the condition can't be simplified,
   * and the result of the condition (0 or 1) if it can.
   */
 -static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
 +static int do_constant_folding_cond(TCGType type, TCGArg x,
                                      TCGArg y, TCGCond c)
  {
      uint64_t xv = arg_info(x)->val;
      uint64_t yv = arg_info(y)->val;
      if (arg_is_const(x) && arg_is_const(y)) {
 -        const TCGOpDef *def = &tcg_op_defs[op];
 -        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
 -        if (def->flags & TCG_OPF_64BIT) {
 -            return do_constant_folding_cond_64(xv, yv, c);
 -        } else {
 +        switch (type) {
 +        case TCG_TYPE_I32:
              return do_constant_folding_cond_32(xv, yv, c);
 +        case TCG_TYPE_I64:
 +            return do_constant_folding_cond_64(xv, yv, c);
 +        default:
 +            /* Only scalar comparisons are optimizable */
 +            return -1;
          }
      } else if (args_are_copies(x, y)) {
          return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, 0);
 +        t = do_constant_folding(op->opc, ctx->type, t, 0);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
          uint64_t t1 = arg_info(op->args[1])->val;
          uint64_t t2 = arg_info(op->args[2])->val;
 -        t1 = do_constant_folding(op->opc, t1, t2);
 +        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                       op->args[2], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
              goto do_brcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
 -        t = do_constant_folding(op->opc, t, op->args[2]);
 +        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          uint64_t t = arg_info(op->args[1])->val;
          if (t != 0) {
 -            t = do_constant_folding(op->opc, t, 0);
 +            t = do_constant_folding(op->opc, ctx->type, t, 0);
              return tcg_opt_gen_movi(ctx, op, op->args[0], t);
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
 -    TCGOpcode opc = op->opc;
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
 +        TCGOpcode opc;
 -        opc = (opc == INDEX_op_movcond_i32
 -               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
 +        switch (ctx->type) {
 +        case TCG_TYPE_I32:
 +            opc = INDEX_op_setcond_i32;
 +            break;
 +        case TCG_TYPE_I64:
 +            opc = INDEX_op_setcond_i64;
 +            break;
 +        default:
 +            g_assert_not_reached();
 +        }
          if (tv == 1 && fv == 0) {
              op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
 +    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
           * Simplify EQ/NE comparisons where one of the pairs
           * can be simplified.
           */
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                       op->args[3], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
              goto do_setcond_high;
          }
 -        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
 +        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                       op->args[4], cond);
          switch (i ^ inv) {
          case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
          copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 +        /* Pre-compute the type of the operation. */
 +        if (def->flags & TCG_OPF_VECTOR) {
 +            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
 +        } else if (def->flags & TCG_OPF_64BIT) {
 +            ctx.type = TCG_TYPE_I64;
 +        } else {
 +            ctx.type = TCG_TYPE_I32;
 +        }
 +
          /* For commutative operations make constant second argument */
          switch (opc) {
          CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                      /* Proceed with possible constant folding. */
                      break;
                  }
 -                if (opc == INDEX_op_sub_i32) {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      neg_op = INDEX_op_neg_i32;
                      have_neg = TCG_TARGET_HAS_neg_i32;
 -                } else if (opc == INDEX_op_sub_i64) {
 +                    break;
 +                case TCG_TYPE_I64:
                      neg_op = INDEX_op_neg_i64;
                      have_neg = TCG_TARGET_HAS_neg_i64;
 -                } else if (TCG_TARGET_HAS_neg_vec) {
 -                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
 -                    unsigned vece = TCGOP_VECE(op);
 -                    neg_op = INDEX_op_neg_vec;
 -                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
 -                } else {
                      break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    neg_op = INDEX_op_neg_vec;
 +                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
 +                                                   TCGOP_VECE(op)) > 0;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_neg) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  TCGOpcode not_op;
                  bool have_not;
 -                if (def->flags & TCG_OPF_VECTOR) {
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                } else if (def->flags & TCG_OPF_64BIT) {
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                } else {
 +                switch (ctx.type) {
 +                case TCG_TYPE_I32:
                      not_op = INDEX_op_not_i32;
                      have_not = TCG_TARGET_HAS_not_i32;
 +                    break;
 +                case TCG_TYPE_I64:
 +                    not_op = INDEX_op_not_i64;
 +                    have_not = TCG_TARGET_HAS_not_i64;
 +                    break;
 +                case TCG_TYPE_V64:
 +                case TCG_TYPE_V128:
 +                case TCG_TYPE_V256:
 +                    not_op = INDEX_op_not_vec;
 +                    have_not = TCG_TARGET_HAS_not_vec;
 +                    break;
 +                default:
 +                    g_assert_not_reached();
                  }
                  if (!have_not) {
                      break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             below, we can ignore high bits, but for further optimizations we
             need to record that the high bits contain garbage.  */
          partmask = z_mask;
 -        if (!(def->flags & TCG_OPF_64BIT)) {
 +        if (ctx.type == TCG_TYPE_I32) {
              z_mask |= ~(tcg_target_ulong)0xffffffffu;
              partmask &= 0xffffffffu;
              affected &= 0xffffffffu;
 --
 .25.1

-New patch
+[PULL 39/56] tcg/optimize: Split out fold_to_not
+Split out the conditional conversion from a more complex logical
 operation to a simple NOT.  Create a couple more helpers to make
 this easy for the outer-most logical operations.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
 file changed, 86 insertions(+), 72 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +/*
 + * Convert @op to NOT, if NOT is supported by the host.
 + * Return true if the conversion is successful, which will still
 + * indicate that the processing is complete.
 + */
 +static bool fold_not(OptContext *ctx, TCGOp *op);
 +static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
 +{
 +    TCGOpcode not_op;
 +    bool have_not;
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        not_op = INDEX_op_not_i32;
 +        have_not = TCG_TARGET_HAS_not_i32;
 +        break;
 +    case TCG_TYPE_I64:
 +        not_op = INDEX_op_not_i64;
 +        have_not = TCG_TARGET_HAS_not_i64;
 +        break;
 +    case TCG_TYPE_V64:
 +    case TCG_TYPE_V128:
 +    case TCG_TYPE_V256:
 +        not_op = INDEX_op_not_vec;
 +        have_not = TCG_TARGET_HAS_not_vec;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    if (have_not) {
 +        op->opc = not_op;
 +        op->args[1] = op->args[idx];
 +        return fold_not(ctx, op);
 +    }
 +    return false;
 +}
 +
 +/* If the binary operation has first argument @i, fold to NOT. */
 +static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
 +        return fold_to_not(ctx, op, 2);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has second argument @i, fold to @i. */
  static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
      return false;
  }
 +/* If the binary operation has second argument @i, fold to NOT. */
 +static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 +{
 +    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
 +        return fold_to_not(ctx, op, 1);
 +    }
 +    return false;
 +}
 +
  /* If the binary operation has both arguments equal, fold to @i. */
  static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
  {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, -1)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_xi_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_not(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    /* Because of fold_to_not, we want to always return true, via finish. */
 +    finish_folding(ctx, op);
 +    return true;
  }
  static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
  static bool fold_orc(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const2(ctx, op);
 +    if (fold_const2(ctx, op) ||
 +        fold_ix_to_not(ctx, op, 0)) {
 +        return true;
 +    }
 +    return false;
  }
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
 -        fold_xx_to_i(ctx, op, 0)) {
 +        fold_xx_to_i(ctx, op, 0) ||
 +        fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
      return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  }
              }
              break;
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(nand):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == -1) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64(nor):
 -            if (!arg_is_const(op->args[1])
 -                && arg_is_const(op->args[2])
 -                && arg_info(op->args[2])->val == 0) {
 -                i = 1;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(andc):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == -1) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        CASE_OP_32_64_VEC(orc):
 -        CASE_OP_32_64(eqv):
 -            if (!arg_is_const(op->args[2])
 -                && arg_is_const(op->args[1])
 -                && arg_info(op->args[1])->val == 0) {
 -                i = 2;
 -                goto try_not;
 -            }
 -            break;
 -        try_not:
 -            {
 -                TCGOpcode not_op;
 -                bool have_not;
 -
 -                switch (ctx.type) {
 -                case TCG_TYPE_I32:
 -                    not_op = INDEX_op_not_i32;
 -                    have_not = TCG_TARGET_HAS_not_i32;
 -                    break;
 -                case TCG_TYPE_I64:
 -                    not_op = INDEX_op_not_i64;
 -                    have_not = TCG_TARGET_HAS_not_i64;
 -                    break;
 -                case TCG_TYPE_V64:
 -                case TCG_TYPE_V128:
 -                case TCG_TYPE_V256:
 -                    not_op = INDEX_op_not_vec;
 -                    have_not = TCG_TARGET_HAS_not_vec;
 -                    break;
 -                default:
 -                    g_assert_not_reached();
 -                }
 -                if (!have_not) {
 -                    break;
 -                }
 -                op->opc = not_op;
 -                reset_temp(op->args[0]);
 -                op->args[1] = op->args[i];
 -                continue;
 -            }
          default:
              break;
          }
 --
 .25.1

-New patch
+[PULL 40/56] tcg/optimize: Split out fold_sub_to_neg
+Even though there is only one user, place this more complex
+conversion into its own helper.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
+file changed, 47 insertions(+), 42 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
+ static bool fold_neg(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const1(ctx, op);
++    if (fold_const1(ctx, op)) {
++        return true;
++    }
++    /*
++     * Because of fold_sub_to_neg, we want to always return true,
++     * via finish_folding.
++     */
++    finish_folding(ctx, op);
++    return true;
+ }
+ static bool fold_nor(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
+     return fold_const2(ctx, op);
+ }
++static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
++{
++    TCGOpcode neg_op;
++    bool have_neg;
++
++    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
++        return false;
++    }
++
++    switch (ctx->type) {
++    case TCG_TYPE_I32:
++        neg_op = INDEX_op_neg_i32;
++        have_neg = TCG_TARGET_HAS_neg_i32;
++        break;
++    case TCG_TYPE_I64:
++        neg_op = INDEX_op_neg_i64;
++        have_neg = TCG_TARGET_HAS_neg_i64;
++        break;
++    case TCG_TYPE_V64:
++    case TCG_TYPE_V128:
++    case TCG_TYPE_V256:
++        neg_op = INDEX_op_neg_vec;
++        have_neg = (TCG_TARGET_HAS_neg_vec &&
++                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
++        break;
++    default:
++        g_assert_not_reached();
++    }
++    if (have_neg) {
++        op->opc = neg_op;
++        op->args[1] = op->args[2];
++        return fold_neg(ctx, op);
++    }
++    return false;
++}
++
+ static bool fold_sub(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
+-        fold_xx_to_i(ctx, op, 0)) {
++        fold_xx_to_i(ctx, op, 0) ||
++        fold_sub_to_neg(ctx, op)) {
+         return true;
+     }
+     return false;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+                 continue;
+             }
+             break;
+-        CASE_OP_32_64_VEC(sub):
+-            {
+-                TCGOpcode neg_op;
+-                bool have_neg;
+-
+-                if (arg_is_const(op->args[2])) {
+-                    /* Proceed with possible constant folding. */
+-                    break;
+-                }
+-                switch (ctx.type) {
+-                case TCG_TYPE_I32:
+-                    neg_op = INDEX_op_neg_i32;
+-                    have_neg = TCG_TARGET_HAS_neg_i32;
+-                    break;
+-                case TCG_TYPE_I64:
+-                    neg_op = INDEX_op_neg_i64;
+-                    have_neg = TCG_TARGET_HAS_neg_i64;
+-                    break;
+-                case TCG_TYPE_V64:
+-                case TCG_TYPE_V128:
+-                case TCG_TYPE_V256:
+-                    neg_op = INDEX_op_neg_vec;
+-                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
+-                                                   TCGOP_VECE(op)) > 0;
+-                    break;
+-                default:
+-                    g_assert_not_reached();
+-                }
+-                if (!have_neg) {
+-                    break;
+-                }
+-                if (arg_is_const(op->args[1])
+-                    && arg_info(op->args[1])->val == 0) {
+-                    op->opc = neg_op;
+-                    reset_temp(op->args[0]);
+-                    op->args[1] = op->args[2];
+-                    continue;
+-                }
+-            }
+-            break;
+         default:
+             break;
+         }
+--
+.25.1

-New patch
+[PULL 41/56] tcg/optimize: Split out fold_xi_to_x
+Pull the "op r, a, i => mov r, a" optimization into a function,
+and use it in the outer-most logical operations.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
+file changed, 26 insertions(+), 35 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+     return false;
+ }
++/* If the binary operation has second argument @i, fold to identity. */
++static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
++{
++    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
++    }
++    return false;
++}
++
+ /* If the binary operation has second argument @i, fold to NOT. */
+ static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+ {
+@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
+ static bool fold_add(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_x(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
+         fold_xi_to_i(ctx, op, 0) ||
++        fold_xi_to_x(ctx, op, -1) ||
+         fold_xx_to_x(ctx, op)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
+         fold_xx_to_i(ctx, op, 0) ||
++        fold_xi_to_x(ctx, op, 0) ||
+         fold_ix_to_not(ctx, op, -1)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
+ static bool fold_eqv(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
++        fold_xi_to_x(ctx, op, -1) ||
+         fold_xi_to_not(ctx, op, 0)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
+ static bool fold_or(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
++        fold_xi_to_x(ctx, op, 0) ||
+         fold_xx_to_x(ctx, op)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
++        fold_xi_to_x(ctx, op, -1) ||
+         fold_ix_to_not(ctx, op, 0)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_x(ctx, op, 0)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
+         fold_xx_to_i(ctx, op, 0) ||
++        fold_xi_to_x(ctx, op, 0) ||
+         fold_sub_to_neg(ctx, op)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
+         fold_xx_to_i(ctx, op, 0) ||
++        fold_xi_to_x(ctx, op, 0) ||
+         fold_xi_to_not(ctx, op, -1)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Simplify expression for "op r, a, const => mov r, a" cases */
+-        switch (opc) {
+-        CASE_OP_32_64_VEC(add):
+-        CASE_OP_32_64_VEC(sub):
+-        CASE_OP_32_64_VEC(or):
+-        CASE_OP_32_64_VEC(xor):
+-        CASE_OP_32_64_VEC(andc):
+-        CASE_OP_32_64(shl):
+-        CASE_OP_32_64(shr):
+-        CASE_OP_32_64(sar):
+-        CASE_OP_32_64(rotl):
+-        CASE_OP_32_64(rotr):
+-            if (!arg_is_const(op->args[1])
+-                && arg_is_const(op->args[2])
+-                && arg_info(op->args[2])->val == 0) {
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-                continue;
+-            }
+-            break;
+-        CASE_OP_32_64_VEC(and):
+-        CASE_OP_32_64_VEC(orc):
+-        CASE_OP_32_64(eqv):
+-            if (!arg_is_const(op->args[1])
+-                && arg_is_const(op->args[2])
+-                && arg_info(op->args[2])->val == -1) {
+-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+-                continue;
+-            }
+-            break;
+-        default:
+-            break;
+-        }
+-
+         /* Simplify using known-zero bits. Currently only ops with a single
+            output argument is supported. */
+         z_mask = -1;
+--
+.25.1

-New patch
+[PULL 42/56] tcg/optimize: Split out fold_ix_to_i
+Pull the "op r, 0, b => movi r, 0" optimization into a function,
+and use it in fold_shift.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 28 ++++++++++------------------
+file changed, 10 insertions(+), 18 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
+     return false;
+ }
++/* If the binary operation has first argument @i, fold to @i. */
++static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
++{
++    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
++        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
++    }
++    return false;
++}
++
+ /* If the binary operation has first argument @i, fold to NOT. */
+ static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+ {
+@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
+ static bool fold_shift(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
++        fold_ix_to_i(ctx, op, 0) ||
+         fold_xi_to_x(ctx, op, 0)) {
+         return true;
+     }
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+             break;
+         }
+-        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
+-           and "sub r, 0, a => neg r, a" case.  */
+-        switch (opc) {
+-        CASE_OP_32_64(shl):
+-        CASE_OP_32_64(shr):
+-        CASE_OP_32_64(sar):
+-        CASE_OP_32_64(rotl):
+-        CASE_OP_32_64(rotr):
+-            if (arg_is_const(op->args[1])
+-                && arg_info(op->args[1])->val == 0) {
+-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
+-                continue;
+-            }
+-            break;
+-        default:
+-            break;
+-        }
+-
+         /* Simplify using known-zero bits. Currently only ops with a single
+            output argument is supported. */
+         z_mask = -1;
+--
+.25.1

-[PULL 02/16] cputlb: Disable __always_inline__ without optimization
+[PULL 43/56] tcg/optimize: Split out fold_masks
-This forced inlining can result in missing symbols,
+Move all of the known-zero optimizations into the per-opcode
-which makes a debugging build harder to follow.
+functions.  Use fold_masks when there is a possibility of the
 result being determined, and simply set ctx->z_mask otherwise.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: David Hildenbrand <david@redhat.com>
 Reported-by: Peter Maydell <peter.maydell@linaro.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/qemu/compiler.h | 11 +++++++++++
+ tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
- accel/tcg/cputlb.c      |  4 ++--
+file changed, 294 insertions(+), 251 deletions(-)
 files changed, 13 insertions(+), 2 deletions(-)
-diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/qemu/compiler.h
+--- a/tcg/optimize.c
-+++ b/include/qemu/compiler.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
- # define QEMU_NONSTRING
+     TCGTempSet temps_used;
- #endif
+     /* In flight values from optimization. */
-+/*
+-    uint64_t z_mask;
-+ * Forced inlining may be desired to encourage constant propagation
++    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
-+ * of function parameters.  However, it can also make debugging harder,
++    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
-+ * so disable it for a non-optimizing build.
+     TCGType type;
-+ */
+ } OptContext;
-+#if defined(__OPTIMIZE__)
-+#define QEMU_ALWAYS_INLINE  __attribute__((always_inline))
+@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
-+#else
+     return false;
-+#define QEMU_ALWAYS_INLINE
+ }
-+#endif
-+
++static bool fold_masks(OptContext *ctx, TCGOp *op)
- /* Implement C11 _Generic via GCC builtins.  Example:
++{
-  *
++    uint64_t a_mask = ctx->a_mask;
-  *    QEMU_GENERIC(x, (float, sinf), (long double, sinl), sin) (x)
++    uint64_t z_mask = ctx->z_mask;
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
++
-index XXXXXXX..XXXXXXX 100644
++    /*
---- a/accel/tcg/cputlb.c
++     * 32-bit ops generate 32-bit results.  For the result-is-zero test
-+++ b/accel/tcg/cputlb.c
++     * below, we can ignore high bits, but for further optimizations we
-@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
++     * need to record that the high bits contain garbage.
- typedef uint64_t FullLoadHelper(CPUArchState *env, target_ulong addr,
++     */
-                                 TCGMemOpIdx oi, uintptr_t retaddr);
++    if (ctx->type == TCG_TYPE_I32) {
++        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
--static inline uint64_t __attribute__((always_inline))
++        a_mask &= MAKE_64BIT_MASK(0, 32);
-+static inline uint64_t QEMU_ALWAYS_INLINE
++        z_mask &= MAKE_64BIT_MASK(0, 32);
- load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
++    }
-             uintptr_t retaddr, MemOp op, bool code_read,
++
-             FullLoadHelper *full_load)
++    if (z_mask == 0) {
-@@ -XXX,XX +XXX,XX @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
++        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
-  * Store Helpers
++    }
-  */
++    if (a_mask == 0) {
++        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
--static inline void __attribute__((always_inline))
++    }
-+static inline void QEMU_ALWAYS_INLINE
++    return false;
- store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
++}
-              TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
++
- {
+ /*
   * Convert @op to NOT, if NOT is supported by the host.
   * Return true if the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
  static bool fold_and(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z1, z2;
 +
      if (fold_const2(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +    z2 = arg_info(op->args[2])->z_mask;
 +    ctx->z_mask = z1 & z2;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer affected bits from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        ctx->a_mask = z1 & ~z2;
 +    }
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_andc(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z1;
 +
      if (fold_const2(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_ix_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    z1 = arg_info(op->args[1])->z_mask;
 +
 +    /*
 +     * Known-zeros does not imply known-ones.  Therefore unless
 +     * arg2 is constant, we can't infer anything from it.
 +     */
 +    if (arg_is_const(op->args[2])) {
 +        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
 +        ctx->a_mask = z1 & ~z2;
 +        z1 &= z2;
 +    }
 +    ctx->z_mask = z1;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask, sign;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
          t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    switch (op->opc) {
 +    case INDEX_op_bswap16_i32:
 +    case INDEX_op_bswap16_i64:
 +        z_mask = bswap16(z_mask);
 +        sign = INT16_MIN;
 +        break;
 +    case INDEX_op_bswap32_i32:
 +    case INDEX_op_bswap32_i64:
 +        z_mask = bswap32(z_mask);
 +        sign = INT32_MIN;
 +        break;
 +    case INDEX_op_bswap64_i64:
 +        z_mask = bswap64(z_mask);
 +        sign = INT64_MIN;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 +    case TCG_BSWAP_OZ:
 +        break;
 +    case TCG_BSWAP_OS:
 +        /* If the sign bit may be 1, force all the bits above to 1. */
 +        if (z_mask & sign) {
 +            z_mask |= sign;
 +        }
 +        break;
 +    default:
 +        /* The high bits are undefined: force all bits above the sign to 1. */
 +        z_mask |= sign << 1;
 +        break;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
  static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
          }
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
      }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        z_mask = 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        z_mask = 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
 +
      return false;
  }
  static bool fold_ctpop(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    switch (ctx->type) {
 +    case TCG_TYPE_I32:
 +        ctx->z_mask = 32 | 31;
 +        break;
 +    case TCG_TYPE_I64:
 +        ctx->z_mask = 64 | 63;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
  }
  static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
          t1 = deposit64(t1, op->args[3], op->args[4], t2);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
      }
 +
 +    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
 +                            op->args[3], op->args[4],
 +                            arg_info(op->args[2])->z_mask);
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
          t = extract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask, sign;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8s):
 +        sign = INT8_MIN;
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16s):
 +        sign = INT16_MIN;
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_ext_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32s_i64:
 +        sign = INT32_MIN;
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    if (z_mask & sign) {
 +        z_mask |= sign;
 +    } else if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_extu(OptContext *ctx, TCGOp *op)
  {
 -    return fold_const1(ctx, op);
 +    uint64_t z_mask_old, z_mask;
 +    bool type_change = false;
 +
 +    if (fold_const1(ctx, op)) {
 +        return true;
 +    }
 +
 +    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +
 +    switch (op->opc) {
 +    CASE_OP_32_64(ext8u):
 +        z_mask = (uint8_t)z_mask;
 +        break;
 +    CASE_OP_32_64(ext16u):
 +        z_mask = (uint16_t)z_mask;
 +        break;
 +    case INDEX_op_extrl_i64_i32:
 +    case INDEX_op_extu_i32_i64:
 +        type_change = true;
 +        QEMU_FALLTHROUGH;
 +    case INDEX_op_ext32u_i64:
 +        z_mask = (uint32_t)z_mask;
 +        break;
 +    case INDEX_op_extrh_i64_i32:
 +        type_change = true;
 +        z_mask >>= 32;
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +
 +    ctx->z_mask = z_mask;
 +    if (!type_change) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    return fold_masks(ctx, op);
  }
  static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
 +    ctx->z_mask = arg_info(op->args[3])->z_mask
 +                | arg_info(op->args[4])->z_mask;
 +
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
          uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
  static bool fold_neg(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t z_mask;
 +
      if (fold_const1(ctx, op)) {
          return true;
      }
 +
 +    /* Set to 1 all bits to the left of the rightmost.  */
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    ctx->z_mask = -(z_mask & -z_mask);
 +
      /*
       * Because of fold_sub_to_neg, we want to always return true,
       * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
          fold_xx_to_x(ctx, op)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
  static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
  {
 +    const TCGOpDef *def = &tcg_op_defs[op->opc];
 +    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 +    MemOp mop = get_memop(oi);
 +    int width = 8 * memop_size(mop);
 +
 +    if (!(mop & MO_SIGN) && width < 64) {
 +        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    }
 +
      /* Opcodes that touch guest memory stop the mb optimization.  */
      ctx->prev_mb = NULL;
      return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
 +
 +    ctx->z_mask = 1;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          op->opc = INDEX_op_setcond_i32;
          break;
      }
 +
 +    ctx->z_mask = 1;
      return false;
   do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 +    int64_t z_mask_old, z_mask;
 +
      if (arg_is_const(op->args[1])) {
          uint64_t t;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
          t = sextract64(t, op->args[2], op->args[3]);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    return false;
 +
 +    z_mask_old = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 +    if (op->args[2] == 0 && z_mask >= 0) {
 +        ctx->a_mask = z_mask_old ^ z_mask;
 +    }
 +    ctx->z_mask = z_mask;
 +
 +    return fold_masks(ctx, op);
  }
  static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
 +
 +    if (arg_is_const(op->args[2])) {
 +        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
 +                                          arg_info(op->args[1])->z_mask,
 +                                          arg_info(op->args[2])->val);
 +        return fold_masks(ctx, op);
 +    }
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
      return fold_addsub2_i32(ctx, op, false);
  }
 +static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 +{
 +    /* We can't do any folding with a load, but we can record bits. */
 +    switch (op->opc) {
 +    CASE_OP_32_64(ld8u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        break;
 +    CASE_OP_32_64(ld16u):
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        break;
 +    case INDEX_op_ld32u_i64:
 +        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        break;
 +    default:
 +        g_assert_not_reached();
 +    }
 +    return false;
 +}
 +
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
      if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
 -    return false;
 +
 +    ctx->z_mask = arg_info(op->args[1])->z_mask
 +                | arg_info(op->args[2])->z_mask;
 +    return fold_masks(ctx, op);
  }
  /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
      }
      QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
 -        uint64_t z_mask, partmask, affected, tmp;
          TCGOpcode opc = op->opc;
          const TCGOpDef *def;
          bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              break;
          }
 -        /* Simplify using known-zero bits. Currently only ops with a single
 -           output argument is supported. */
 -        z_mask = -1;
 -        affected = -1;
 -        switch (opc) {
 -        CASE_OP_32_64(ext8s):
 -            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext8u):
 -            z_mask = 0xff;
 -            goto and_const;
 -        CASE_OP_32_64(ext16s):
 -            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        CASE_OP_32_64(ext16u):
 -            z_mask = 0xffff;
 -            goto and_const;
 -        case INDEX_op_ext32s_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_ext32u_i64:
 -            z_mask = 0xffffffffU;
 -            goto and_const;
 -
 -        CASE_OP_32_64(and):
 -            z_mask = arg_info(op->args[2])->z_mask;
 -            if (arg_is_const(op->args[2])) {
 -        and_const:
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            z_mask = arg_info(op->args[1])->z_mask & z_mask;
 -            break;
 -
 -        case INDEX_op_ext_i32_i64:
 -            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
 -                break;
 -            }
 -            QEMU_FALLTHROUGH;
 -        case INDEX_op_extu_i32_i64:
 -            /* We do not compute affected as it is a size changing op.  */
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(andc):
 -            /* Known-zeros does not imply known-ones.  Therefore unless
 -               op->args[2] is constant, we can't infer anything from it.  */
 -            if (arg_is_const(op->args[2])) {
 -                z_mask = ~arg_info(op->args[2])->z_mask;
 -                goto and_const;
 -            }
 -            /* But we certainly know nothing outside args[1] may be set. */
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            break;
 -
 -        case INDEX_op_sar_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_sar_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_shr_i32:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 31;
 -                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -        case INDEX_op_shr_i64:
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & 63;
 -                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
 -            }
 -            break;
 -
 -        case INDEX_op_extrl_i64_i32:
 -            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
 -            break;
 -        case INDEX_op_extrh_i64_i32:
 -            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
 -            break;
 -
 -        CASE_OP_32_64(shl):
 -            if (arg_is_const(op->args[2])) {
 -                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
 -                z_mask = arg_info(op->args[1])->z_mask << tmp;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(neg):
 -            /* Set to 1 all bits to the left of the rightmost.  */
 -            z_mask = -(arg_info(op->args[1])->z_mask
 -                       & -arg_info(op->args[1])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(deposit):
 -            z_mask = deposit64(arg_info(op->args[1])->z_mask,
 -                               op->args[3], op->args[4],
 -                               arg_info(op->args[2])->z_mask);
 -            break;
 -
 -        CASE_OP_32_64(extract):
 -            z_mask = extract64(arg_info(op->args[1])->z_mask,
 -                               op->args[2], op->args[3]);
 -            if (op->args[2] == 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -        CASE_OP_32_64(sextract):
 -            z_mask = sextract64(arg_info(op->args[1])->z_mask,
 -                                op->args[2], op->args[3]);
 -            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
 -                affected = arg_info(op->args[1])->z_mask & ~z_mask;
 -            }
 -            break;
 -
 -        CASE_OP_32_64(or):
 -        CASE_OP_32_64(xor):
 -            z_mask = arg_info(op->args[1])->z_mask
 -                   | arg_info(op->args[2])->z_mask;
 -            break;
 -
 -        case INDEX_op_clz_i32:
 -        case INDEX_op_ctz_i32:
 -            z_mask = arg_info(op->args[2])->z_mask | 31;
 -            break;
 -
 -        case INDEX_op_clz_i64:
 -        case INDEX_op_ctz_i64:
 -            z_mask = arg_info(op->args[2])->z_mask | 63;
 -            break;
 -
 -        case INDEX_op_ctpop_i32:
 -            z_mask = 32 | 31;
 -            break;
 -        case INDEX_op_ctpop_i64:
 -            z_mask = 64 | 63;
 -            break;
 -
 -        CASE_OP_32_64(setcond):
 -        case INDEX_op_setcond2_i32:
 -            z_mask = 1;
 -            break;
 -
 -        CASE_OP_32_64(movcond):
 -            z_mask = arg_info(op->args[3])->z_mask
 -                   | arg_info(op->args[4])->z_mask;
 -            break;
 -
 -        CASE_OP_32_64(ld8u):
 -            z_mask = 0xff;
 -            break;
 -        CASE_OP_32_64(ld16u):
 -            z_mask = 0xffff;
 -            break;
 -        case INDEX_op_ld32u_i64:
 -            z_mask = 0xffffffffu;
 -            break;
 -
 -        CASE_OP_32_64(qemu_ld):
 -            {
 -                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
 -                MemOp mop = get_memop(oi);
 -                if (!(mop & MO_SIGN)) {
 -                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
 -                }
 -            }
 -            break;
 -
 -        CASE_OP_32_64(bswap16):
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffff) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap16(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int16_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(16, 48);
 -                break;
 -            }
 -            break;
 -
 -        case INDEX_op_bswap32_i64:
 -            z_mask = arg_info(op->args[1])->z_mask;
 -            if (z_mask <= 0xffffffffu) {
 -                op->args[2] |= TCG_BSWAP_IZ;
 -            }
 -            z_mask = bswap32(z_mask);
 -            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
 -            case TCG_BSWAP_OZ:
 -                break;
 -            case TCG_BSWAP_OS:
 -                z_mask = (int32_t)z_mask;
 -                break;
 -            default: /* undefined high bits */
 -                z_mask |= MAKE_64BIT_MASK(32, 32);
 -                break;
 -            }
 -            break;
 -
 -        default:
 -            break;
 -        }
 -
 -        /* 32-bit ops generate 32-bit results.  For the result is zero test
 -           below, we can ignore high bits, but for further optimizations we
 -           need to record that the high bits contain garbage.  */
 -        partmask = z_mask;
 -        if (ctx.type == TCG_TYPE_I32) {
 -            z_mask |= ~(tcg_target_ulong)0xffffffffu;
 -            partmask &= 0xffffffffu;
 -            affected &= 0xffffffffu;
 -        }
 -        ctx.z_mask = z_mask;
 -
 -        if (partmask == 0) {
 -            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
 -            continue;
 -        }
 -        if (affected == 0) {
 -            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
 -            continue;
 -        }
 +        /* Assume all bits affected, and no bits known zero. */
 +        ctx.a_mask = -1;
 +        ctx.z_mask = -1;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32u_i64:
 +            done = fold_tcg_ld(&ctx, op);
 +            break;
          case INDEX_op_mb:
              done = fold_mb(&ctx, op);
              break;
 --
-.17.1
+.25.1

-New patch
+[PULL 44/56] tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
+Rename to fold_multiply2, and handle muls2_i32, mulu2_i64,
+and muls2_i64.
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
+file changed, 35 insertions(+), 9 deletions(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+     return false;
+ }
+-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
++static bool fold_multiply2(OptContext *ctx, TCGOp *op)
+ {
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+-        uint32_t a = arg_info(op->args[2])->val;
+-        uint32_t b = arg_info(op->args[3])->val;
+-        uint64_t r = (uint64_t)a * b;
++        uint64_t a = arg_info(op->args[2])->val;
++        uint64_t b = arg_info(op->args[3])->val;
++        uint64_t h, l;
+         TCGArg rl, rh;
+-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
++        TCGOp *op2;
++
++        switch (op->opc) {
++        case INDEX_op_mulu2_i32:
++            l = (uint64_t)(uint32_t)a * (uint32_t)b;
++            h = (int32_t)(l >> 32);
++            l = (int32_t)l;
++            break;
++        case INDEX_op_muls2_i32:
++            l = (int64_t)(int32_t)a * (int32_t)b;
++            h = l >> 32;
++            l = (int32_t)l;
++            break;
++        case INDEX_op_mulu2_i64:
++            mulu64(&l, &h, a, b);
++            break;
++        case INDEX_op_muls2_i64:
++            muls64(&l, &h, a, b);
++            break;
++        default:
++            g_assert_not_reached();
++        }
+         rl = op->args[0];
+         rh = op->args[1];
+-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
+-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
++
++        /* The proper opcode is supplied by tcg_opt_gen_mov. */
++        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
++
++        tcg_opt_gen_movi(ctx, op, rl, l);
++        tcg_opt_gen_movi(ctx, op2, rh, h);
+         return true;
+     }
+     return false;
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64(muluh):
+             done = fold_mul_highpart(&ctx, op);
+             break;
+-        case INDEX_op_mulu2_i32:
+-            done = fold_mulu2_i32(&ctx, op);
++        CASE_OP_32_64(muls2):
++        CASE_OP_32_64(mulu2):
++            done = fold_multiply2(&ctx, op);
+             break;
+         CASE_OP_32_64(nand):
+             done = fold_nand(&ctx, op);
+--
+.25.1

-[PULL 14/16] cputlb: Remove tb_invalidate_phys_page_range is_cpu_write_access
+[PULL 45/56] tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
-All callers pass false to this argument.  Remove it and pass the
+Rename to fold_addsub2.
-constant on to tb_invalidate_phys_page_range__locked.
+Use Int128 to implement the wider operation.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/translate-all.h | 3 +--
+ tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
- accel/tcg/translate-all.c | 6 ++----
+file changed, 44 insertions(+), 21 deletions(-)
  exec.c                    | 4 ++--
 files changed, 5 insertions(+), 8 deletions(-)
-diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/translate-all.h
+--- a/tcg/optimize.c
-+++ b/accel/tcg/translate-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ struct page_collection *page_collection_lock(tb_page_addr_t start,
+@@ -XXX,XX +XXX,XX @@
  void page_collection_unlock(struct page_collection *set);
  void tb_invalidate_phys_page_fast(struct page_collection *pages,
                                    tb_page_addr_t start, int len);
 -void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
 -                                   int is_cpu_write_access);
 +void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
  void tb_check_watchpoint(CPUState *cpu);
  #ifdef CONFIG_USER_ONLY
 diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
   *
   * Called with mmap_lock held for user-mode emulation
   */
--void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
--                                   int is_cpu_write_access)
+ #include "qemu/osdep.h"
-+void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end)
++#include "qemu/int128.h"
  #include "tcg/tcg-op.h"
  #include "tcg-internal.h"
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
      return false;
  }
 -static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
 +static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
  {
-     struct page_collection *pages;
+     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
-     PageDesc *p;
+         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
+-        uint32_t al = arg_info(op->args[2])->val;
-         return;
+-        uint32_t ah = arg_info(op->args[3])->val;
 -        uint32_t bl = arg_info(op->args[4])->val;
 -        uint32_t bh = arg_info(op->args[5])->val;
 -        uint64_t a = ((uint64_t)ah << 32) | al;
 -        uint64_t b = ((uint64_t)bh << 32) | bl;
 +        uint64_t al = arg_info(op->args[2])->val;
 +        uint64_t ah = arg_info(op->args[3])->val;
 +        uint64_t bl = arg_info(op->args[4])->val;
 +        uint64_t bh = arg_info(op->args[5])->val;
          TCGArg rl, rh;
 -        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
 +        TCGOp *op2;
 -        if (add) {
 -            a += b;
 +        if (ctx->type == TCG_TYPE_I32) {
 +            uint64_t a = deposit64(al, 32, 32, ah);
 +            uint64_t b = deposit64(bl, 32, 32, bh);
 +
 +            if (add) {
 +                a += b;
 +            } else {
 +                a -= b;
 +            }
 +
 +            al = sextract64(a, 0, 32);
 +            ah = sextract64(a, 32, 32);
          } else {
 -            a -= b;
 +            Int128 a = int128_make128(al, ah);
 +            Int128 b = int128_make128(bl, bh);
 +
 +            if (add) {
 +                a = int128_add(a, b);
 +            } else {
 +                a = int128_sub(a, b);
 +            }
 +
 +            al = int128_getlo(a);
 +            ah = int128_gethi(a);
          }
          rl = op->args[0];
          rh = op->args[1];
 -        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
 -        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
 +
 +        /* The proper opcode is supplied by tcg_opt_gen_mov. */
 +        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
 +
 +        tcg_opt_gen_movi(ctx, op, rl, al);
 +        tcg_opt_gen_movi(ctx, op2, rh, ah);
          return true;
      }
-     pages = page_collection_lock(start, end);
+     return false;
 -    tb_invalidate_phys_page_range__locked(pages, p, start, end,
 -                                          is_cpu_write_access);
 +    tb_invalidate_phys_page_range__locked(pages, p, start, end, 0);
      page_collection_unlock(pages);
  }
-diff --git a/exec.c b/exec.c
+-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
++static bool fold_add2(OptContext *ctx, TCGOp *op)
 --- a/exec.c
 +++ b/exec.c
@@ -XXX,XX +XXX,XX @@ const char *parse_cpu_option(const char *cpu_option)
  void tb_invalidate_phys_addr(target_ulong addr)
  {
-     mmap_lock();
+-    return fold_addsub2_i32(ctx, op, true);
--    tb_invalidate_phys_page_range(addr, addr + 1, 0);
++    return fold_addsub2(ctx, op, true);
 +    tb_invalidate_phys_page_range(addr, addr + 1);
      mmap_unlock();
  }
-@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
+ static bool fold_and(OptContext *ctx, TCGOp *op)
-         return;
+@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
-     }
+     return false;
      ram_addr = memory_region_get_ram_addr(mr) + addr;
 -    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1, 0);
 +    tb_invalidate_phys_page_range(ram_addr, ram_addr + 1);
      rcu_read_unlock();
  }
+-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
++static bool fold_sub2(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_addsub2_i32(ctx, op, false);
++    return fold_addsub2(ctx, op, false);
+ }
+ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(add):
+             done = fold_add(&ctx, op);
+             break;
+-        case INDEX_op_add2_i32:
+-            done = fold_add2_i32(&ctx, op);
++        CASE_OP_32_64(add2):
++            done = fold_add2(&ctx, op);
+             break;
+         CASE_OP_32_64_VEC(and):
+             done = fold_and(&ctx, op);
+@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
+         CASE_OP_32_64_VEC(sub):
+             done = fold_sub(&ctx, op);
+             break;
+-        case INDEX_op_sub2_i32:
+-            done = fold_sub2_i32(&ctx, op);
++        CASE_OP_32_64(sub2):
++            done = fold_sub2(&ctx, op);
+             break;
+         CASE_OP_32_64_VEC(xor):
+             done = fold_xor(&ctx, op);
 --
-.17.1
+.25.1

-New patch
+[PULL 46/56] tcg/optimize: Sink commutative operand swapping into fold functions
+Most of these are handled by creating a fold_const2_commutative
 to handle all of the binary operators.  The rest were already
 handled on a case-by-case basis in the switch, and have their
 own fold function in which to place the call.
 We now have only one major switch on TCGOpcode.
 Introduce NO_DEST and a block comment for swap_commutative in
 order to make the handling of brcond and movcond opcodes cleaner.
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
 file changed, 70 insertions(+), 72 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
      return -1;
  }
 +/**
 + * swap_commutative:
 + * @dest: TCGArg of the destination argument, or NO_DEST.
 + * @p1: first paired argument
 + * @p2: second paired argument
 + *
 + * If *@p1 is a constant and *@p2 is not, swap.
 + * If *@p2 matches @dest, swap.
 + * Return true if a swap was performed.
 + */
 +
 +#define NO_DEST  temp_arg(NULL)
 +
  static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
  {
      TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
      return false;
  }
 +static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 +{
 +    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 +    return fold_const2(ctx, op);
 +}
 +
  static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
  static bool fold_add(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
  static bool fold_add2(OptContext *ctx, TCGOp *op)
  {
 +    /* Note that the high and low parts may be independently swapped. */
 +    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 +    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 +
      return fold_addsub2(ctx, op, true);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
  {
      uint64_t z1, z2;
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  static bool fold_brcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[2];
 -    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
 +        op->args[2] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
      if (i == 0) {
          tcg_op_remove(ctx->tcg, op);
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
  static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[4];
 -    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      TCGArg label = op->args[5];
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[0], &op->args[2])) {
 +        op->args[4] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
      if (i >= 0) {
          goto do_brcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
  static bool fold_eqv(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, -1) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
  static bool fold_movcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +    /*
 +     * Canonicalize the "false" input reg to match the destination reg so
 +     * that the tcg backend can implement a "move if true" operation.
 +     */
 +    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 +        op->args[5] = cond = tcg_invert_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
  static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_i(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
  static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  {
 +    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 +
      if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
          uint64_t a = arg_info(op->args[2])->val;
          uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
  static bool fold_nand(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
  static bool fold_nor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
  static bool fold_or(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xx_to_x(ctx, op)) {
          return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  static bool fold_setcond(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[3];
 -    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 +    int i;
 +    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 +        op->args[3] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
      if (i >= 0) {
          return tcg_opt_gen_movi(ctx, op, op->args[0], i);
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
  static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  {
      TCGCond cond = op->args[5];
 -    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
 -    int inv = 0;
 +    int i, inv = 0;
 +    if (swap_commutative2(&op->args[1], &op->args[3])) {
 +        op->args[5] = cond = tcg_swap_cond(cond);
 +    }
 +
 +    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
      if (i >= 0) {
          goto do_setcond_const;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  static bool fold_xor(OptContext *ctx, TCGOp *op)
  {
 -    if (fold_const2(ctx, op) ||
 +    if (fold_const2_commutative(ctx, op) ||
          fold_xx_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0) ||
          fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* For commutative operations make constant second argument */
 -        switch (opc) {
 -        CASE_OP_32_64_VEC(add):
 -        CASE_OP_32_64_VEC(mul):
 -        CASE_OP_32_64_VEC(and):
 -        CASE_OP_32_64_VEC(or):
 -        CASE_OP_32_64_VEC(xor):
 -        CASE_OP_32_64(eqv):
 -        CASE_OP_32_64(nand):
 -        CASE_OP_32_64(nor):
 -        CASE_OP_32_64(muluh):
 -        CASE_OP_32_64(mulsh):
 -            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
 -            break;
 -        CASE_OP_32_64(brcond):
 -            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
 -                op->args[2] = tcg_swap_cond(op->args[2]);
 -            }
 -            break;
 -        CASE_OP_32_64(setcond):
 -            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
 -                op->args[3] = tcg_swap_cond(op->args[3]);
 -            }
 -            break;
 -        CASE_OP_32_64(movcond):
 -            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            /* For movcond, we canonicalize the "false" input reg to match
 -               the destination reg so that the tcg backend can implement
 -               a "move if true" operation.  */
 -            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
 -                op->args[5] = tcg_invert_cond(op->args[5]);
 -            }
 -            break;
 -        CASE_OP_32_64(add2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
 -            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
 -            break;
 -        CASE_OP_32_64(mulu2):
 -        CASE_OP_32_64(muls2):
 -            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
 -            break;
 -        case INDEX_op_brcond2_i32:
 -            if (swap_commutative2(&op->args[0], &op->args[2])) {
 -                op->args[4] = tcg_swap_cond(op->args[4]);
 -            }
 -            break;
 -        case INDEX_op_setcond2_i32:
 -            if (swap_commutative2(&op->args[1], &op->args[3])) {
 -                op->args[5] = tcg_swap_cond(op->args[5]);
 -            }
 -            break;
 -        default:
 -            break;
 -        }
 -
          /* Assume all bits affected, and no bits known zero. */
          ctx.a_mask = -1;
          ctx.z_mask = -1;
 --
 .25.1

-[PULL 10/16] cputlb: Partially inline memory_region_section_get_iotlb
+[PULL 47/56] tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
-There is only one caller, tlb_set_page_with_attrs.  We cannot
+This "garbage" setting pre-dates the addition of the type
-inline the entire function because the AddressSpaceDispatch
+changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
-structure is private to exec.c, and cannot easily be moved to
+and INDEX_op_extr{l,h}_i64_i32.
 include/exec/memory-internal.h.
-Compute is_ram and is_romd once within tlb_set_page_with_attrs.
+So now we have definitive points at which to adjust z_mask
-Fold the number of tests against these predicates.  Compute
+to eliminate such bits from the 32-bit operands.
 cpu_physical_memory_is_clean outside of the tlb lock region.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/exec-all.h |  6 +---
+ tcg/optimize.c | 35 ++++++++++++++++-------------------
- accel/tcg/cputlb.c      | 68 ++++++++++++++++++++++++++---------------
+file changed, 16 insertions(+), 19 deletions(-)
  exec.c                  | 22 ++-----------
 files changed, 47 insertions(+), 49 deletions(-)
-diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/exec-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/exec-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
+@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
-                                   hwaddr *xlat, hwaddr *plen,
+         ti->is_const = true;
-                                   MemTxAttrs attrs, int *prot);
+         ti->val = ts->val;
- hwaddr memory_region_section_get_iotlb(CPUState *cpu,
+         ti->z_mask = ts->val;
--                                       MemoryRegionSection *section,
+-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
--                                       target_ulong vaddr,
+-            /* High bits of a 32-bit quantity are garbage.  */
--                                       hwaddr paddr, hwaddr xlat,
+-            ti->z_mask |= ~0xffffffffull;
--                                       int prot,
+-        }
--                                       target_ulong *address);
+     } else {
-+                                       MemoryRegionSection *section);
+         ti->is_const = false;
- #endif
+         ti->z_mask = -1;
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
- /* vl.c */
+     TCGTemp *src_ts = arg_temp(src);
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
+     TempOptInfo *di;
-index XXXXXXX..XXXXXXX 100644
+     TempOptInfo *si;
---- a/accel/tcg/cputlb.c
+-    uint64_t z_mask;
-+++ b/accel/tcg/cputlb.c
+     TCGOpcode new_op;
-@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
-     MemoryRegionSection *section;
+     if (ts_are_copies(dst_ts, src_ts)) {
-     unsigned int index;
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-     target_ulong address;
+     op->args[0] = dst;
--    target_ulong code_address;
+     op->args[1] = src;
-+    target_ulong write_address;
-     uintptr_t addend;
+-    z_mask = si->z_mask;
-     CPUTLBEntry *te, tn;
+-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-     hwaddr iotlb, xlat, sz, paddr_page;
+-        /* High bits of the destination are now garbage.  */
-     target_ulong vaddr_page;
+-        z_mask |= ~0xffffffffull;
-     int asidx = cpu_asidx_from_attrs(cpu, attrs);
+-    }
-     int wp_flags;
+-    di->z_mask = z_mask;
-+    bool is_ram, is_romd;
++    di->z_mask = si->z_mask;
-     assert_cpu_is_self(cpu);
+     if (src_ts->type == dst_ts->type) {
+         TempOptInfo *ni = ts_info(si->next_copy);
-@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
+@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
-     if (attrs.byte_swap) {
+ static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
-         address |= TLB_BSWAP;
+                              TCGArg dst, uint64_t val)
-     }
+ {
--    if (!memory_region_is_ram(section->mr) &&
+-    /* Convert movi to mov with constant temp. */
--        !memory_region_is_romd(section->mr)) {
+-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
--        /* IO memory case */
++    TCGTemp *tv;
--        address |= TLB_MMIO;
--        addend = 0;
++    if (ctx->type == TCG_TYPE_I32) {
--    } else {
++        val = (int32_t)val;
 +
 +    is_ram = memory_region_is_ram(section->mr);
 +    is_romd = memory_region_is_romd(section->mr);
 +
 +    if (is_ram || is_romd) {
 +        /* RAM and ROMD both have associated host memory. */
          addend = (uintptr_t)memory_region_get_ram_ptr(section->mr) + xlat;
 +    } else {
 +        /* I/O does not; force the host address to NULL. */
 +        addend = 0;
 +    }
 +
-+    write_address = address;
++    /* Convert movi to mov with constant temp. */
-+    if (is_ram) {
++    tv = tcg_constant_internal(ctx->type, val);
-+        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
+     init_ts_info(ctx, tv);
-+        /*
+     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
-+         * Computing is_clean is expensive; avoid all that unless
+ }
-+         * the page is actually writable.
+@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
-+         */
+     uint64_t z_mask = ctx->z_mask;
-+        if (prot & PAGE_WRITE) {
-+            if (section->readonly) {
+     /*
-+                write_address |= TLB_DISCARD_WRITE;
+-     * 32-bit ops generate 32-bit results.  For the result is zero test
-+            } else if (cpu_physical_memory_is_clean(iotlb)) {
+-     * below, we can ignore high bits, but for further optimizations we
-+                write_address |= TLB_NOTDIRTY;
+-     * need to record that the high bits contain garbage.
-+            }
++     * 32-bit ops generate 32-bit results, which for the purpose of
-+        }
++     * simplifying tcg are sign-extended.  Certainly that's how we
-+    } else {
++     * represent our constants elsewhere.  Note that the bits will
-+        /* I/O or ROMD */
++     * be reset properly for a 64-bit value when encountering the
-+        iotlb = memory_region_section_get_iotlb(cpu, section) + xlat;
++     * type changing opcodes.
-+        /*
+      */
-+         * Writes to romd devices must go through MMIO to enable write.
+     if (ctx->type == TCG_TYPE_I32) {
-+         * Reads to romd devices go through the ram_ptr found above,
+-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
-+         * but of course reads to I/O must go through MMIO.
+-        a_mask &= MAKE_64BIT_MASK(0, 32);
-+         */
+-        z_mask &= MAKE_64BIT_MASK(0, 32);
-+        write_address |= TLB_MMIO;
++        a_mask = (int32_t)a_mask;
-+        if (!is_romd) {
++        z_mask = (int32_t)z_mask;
-+            address = write_address;
++        ctx->z_mask = z_mask;
 +        }
      }
--    code_address = address;
+     if (z_mask == 0) {
 -    iotlb = memory_region_section_get_iotlb(cpu, section, vaddr_page,
 -                                            paddr_page, xlat, prot, &address);
      wp_flags = cpu_watchpoint_address_matches(cpu, vaddr_page,
                                                TARGET_PAGE_SIZE);
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
      /*
       * At this point iotlb contains a physical section number in the lower
       * TARGET_PAGE_BITS, and either
 -     *  + the ram_addr_t of the page base of the target RAM (if NOTDIRTY or ROM)
 -     *  + the offset within section->mr of the page base (otherwise)
 +     *  + the ram_addr_t of the page base of the target RAM (RAM)
 +     *  + the offset within section->mr of the page base (I/O, ROMD)
       * We subtract the vaddr_page (which is page aligned and thus won't
       * disturb the low bits) to give an offset which can be added to the
       * (non-page-aligned) vaddr of the eventual memory access to get
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
      }
      if (prot & PAGE_EXEC) {
 -        tn.addr_code = code_address;
 +        tn.addr_code = address;
      } else {
          tn.addr_code = -1;
      }
      tn.addr_write = -1;
      if (prot & PAGE_WRITE) {
 -        tn.addr_write = address;
 -        if (memory_region_is_romd(section->mr)) {
 -            /* Use the MMIO path so that the device can switch states. */
 -            tn.addr_write |= TLB_MMIO;
 -        } else if (memory_region_is_ram(section->mr)) {
 -            if (section->readonly) {
 -                tn.addr_write |= TLB_DISCARD_WRITE;
 -            } else if (cpu_physical_memory_is_clean(
 -                        memory_region_get_ram_addr(section->mr) + xlat)) {
 -                tn.addr_write |= TLB_NOTDIRTY;
 -            }
 -        }
 +        tn.addr_write = write_address;
          if (prot & PAGE_WRITE_INV) {
              tn.addr_write |= TLB_INVALID_MASK;
          }
 diff --git a/exec.c b/exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/exec.c
 +++ b/exec.c
@@ -XXX,XX +XXX,XX @@ bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
  /* Called from RCU critical section */
  hwaddr memory_region_section_get_iotlb(CPUState *cpu,
 -                                       MemoryRegionSection *section,
 -                                       target_ulong vaddr,
 -                                       hwaddr paddr, hwaddr xlat,
 -                                       int prot,
 -                                       target_ulong *address)
 +                                       MemoryRegionSection *section)
  {
 -    hwaddr iotlb;
 -
 -    if (memory_region_is_ram(section->mr)) {
 -        /* Normal RAM.  */
 -        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
 -    } else {
 -        AddressSpaceDispatch *d;
 -
 -        d = flatview_to_dispatch(section->fv);
 -        iotlb = section - d->map.sections;
 -        iotlb += xlat;
 -    }
 -
 -    return iotlb;
 +    AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
 +    return section - d->map.sections;
  }
  #endif /* defined(CONFIG_USER_ONLY) */
 --
-.17.1
+.25.1

-[PULL 03/16] qemu/compiler.h: Add qemu_build_not_reached
+[PULL 48/56] tcg/optimize: Use fold_xx_to_i for orc
-Use this as a compile-time assert that a particular
+Recognize the constant function for or-complement.
 code path is not reachable.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/qemu/compiler.h | 15 +++++++++++++++
+ tcg/optimize.c | 1 +
-file changed, 15 insertions(+)
+file changed, 1 insertion(+)
-diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/qemu/compiler.h
+--- a/tcg/optimize.c
-+++ b/include/qemu/compiler.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
- #define QEMU_GENERIC9(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC8(x, __VA_ARGS__))
+ static bool fold_orc(OptContext *ctx, TCGOp *op)
- #define QEMU_GENERIC10(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC9(x, __VA_ARGS__))
+ {
+     if (fold_const2(ctx, op) ||
-+/**
++        fold_xx_to_i(ctx, op, -1) ||
-+ * qemu_build_not_reached()
+         fold_xi_to_x(ctx, op, -1) ||
-+ *
+         fold_ix_to_not(ctx, op, 0)) {
-+ * The compiler, during optimization, is expected to prove that a call
+         return true;
 + * to this function cannot be reached and remove it.  If the compiler
 + * supports QEMU_ERROR, this will be reported at compile time; otherwise
 + * this will be reported at link time due to the missing symbol.
 + */
 +#ifdef __OPTIMIZE__
 +extern void QEMU_NORETURN QEMU_ERROR("code path is reachable")
 +    qemu_build_not_reached(void);
 +#else
 +#define qemu_build_not_reached()  g_assert_not_reached()
 +#endif
 +
  #endif /* COMPILER_H */
 --
-.17.1
+.25.1

-New patch
+[PULL 49/56] tcg/optimize: Use fold_xi_to_x for mul
+Recognize the identity function for low-part multiply.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 3 ++-
+file changed, 2 insertions(+), 1 deletion(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
+ static bool fold_mul(OptContext *ctx, TCGOp *op)
+ {
+     if (fold_const2(ctx, op) ||
+-        fold_xi_to_i(ctx, op, 0)) {
++        fold_xi_to_i(ctx, op, 0) ||
++        fold_xi_to_x(ctx, op, 1)) {
+         return true;
+     }
+     return false;
+--
+.25.1

-New patch
+[PULL 50/56] tcg/optimize: Use fold_xi_to_x for div
+Recognize the identity function for division.
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
+Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
+---
+ tcg/optimize.c | 6 +++++-
+file changed, 5 insertions(+), 1 deletion(-)
+diff --git a/tcg/optimize.c b/tcg/optimize.c
+index XXXXXXX..XXXXXXX 100644
+--- a/tcg/optimize.c
++++ b/tcg/optimize.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
+ static bool fold_divide(OptContext *ctx, TCGOp *op)
+ {
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
++        fold_xi_to_x(ctx, op, 1)) {
++        return true;
++    }
++    return false;
+ }
+ static bool fold_dup(OptContext *ctx, TCGOp *op)
+--
+.25.1

-[PULL 16/16] cputlb: Pass retaddr to tb_check_watchpoint
+[PULL 51/56] tcg/optimize: Use fold_xx_to_i for rem
-Fixes the previous TLB_WATCHPOINT patches because we are currently
+Recognize the constant function for remainder.
 failing to set cpu->mem_io_pc with the call to cpu_check_watchpoint.
 Pass down the retaddr directly because it's readily available.
-Fixes: 50b107c5d61
+Suggested-by: Luis Pires <luis.pires@eldorado.org.br>
-Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Reviewed-by: David Hildenbrand <david@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/translate-all.h | 2 +-
+ tcg/optimize.c | 6 +++++-
- accel/tcg/translate-all.c | 6 +++---
+file changed, 5 insertions(+), 1 deletion(-)
  exec.c                    | 2 +-
 files changed, 5 insertions(+), 5 deletions(-)
-diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/translate-all.h
+--- a/tcg/optimize.c
-+++ b/accel/tcg/translate-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
+@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
-                                   tb_page_addr_t start, int len,
-                                   uintptr_t retaddr);
+ static bool fold_remainder(OptContext *ctx, TCGOp *op)
  void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
 -void tb_check_watchpoint(CPUState *cpu);
 +void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr);
  #ifdef CONFIG_USER_ONLY
  int page_unprotect(target_ulong address, uintptr_t pc);
 diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/translate-all.c
 +++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
  #endif
  /* user-mode: call with mmap_lock held */
 -void tb_check_watchpoint(CPUState *cpu)
 +void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
  {
-     TranslationBlock *tb;
+-    return fold_const2(ctx, op);
++    if (fold_const2(ctx, op) ||
-     assert_memory_lock();
++        fold_xx_to_i(ctx, op, 0)) {
++        return true;
--    tb = tcg_tb_lookup(cpu->mem_io_pc);
++    }
-+    tb = tcg_tb_lookup(retaddr);
++    return false;
-     if (tb) {
+ }
-         /* We can use retranslation to find the PC.  */
--        cpu_restore_state_from_tb(cpu, tb, cpu->mem_io_pc, true);
+ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 +        cpu_restore_state_from_tb(cpu, tb, retaddr, true);
          tb_phys_invalidate(tb, -1);
      } else {
          /* The exception probably happened in a helper.  The CPU state should
 diff --git a/exec.c b/exec.c
 index XXXXXXX..XXXXXXX 100644
 --- a/exec.c
 +++ b/exec.c
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
                  cpu->watchpoint_hit = wp;
                  mmap_lock();
 -                tb_check_watchpoint(cpu);
 +                tb_check_watchpoint(cpu, ra);
                  if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                      cpu->exception_index = EXCP_DEBUG;
                      mmap_unlock();
 --
-.17.1
+.25.1

-New patch
+[PULL 52/56] tcg/optimize: Optimize sign extensions
+Certain targets, like riscv, produce signed 32-bit results.
 This can lead to lots of redundant extensions as values are
 manipulated.
 Begin by tracking only the obvious sign-extensions, and
 converting them to simple copies when possible.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
  tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
 file changed, 102 insertions(+), 21 deletions(-)
 diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
 --- a/tcg/optimize.c
 +++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
      TCGTemp *next_copy;
      uint64_t val;
      uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 +    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
  } TempOptInfo;
  typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
      /* In flight values from optimization. */
      uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
      uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
 +    uint64_t s_mask;  /* mask of clrsb(value) bits */
      TCGType type;
  } OptContext;
 +/* Calculate the smask for a specific value. */
 +static uint64_t smask_from_value(uint64_t value)
 +{
 +    int rep = clrsb64(value);
 +    return ~(~0ull >> rep);
 +}
 +
 +/*
 + * Calculate the smask for a given set of known-zeros.
 + * If there are lots of zeros on the left, we can consider the remainder
 + * an unsigned field, and thus the corresponding signed field is one bit
 + * larger.
 + */
 +static uint64_t smask_from_zmask(uint64_t zmask)
 +{
 +    /*
 +     * Only the 0 bits are significant for zmask, thus the msb itself
 +     * must be zero, else we have no sign information.
 +     */
 +    int rep = clz64(zmask);
 +    if (rep == 0) {
 +        return 0;
 +    }
 +    rep -= 1;
 +    return ~(~0ull >> rep);
 +}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
      ti->prev_copy = ts;
      ti->is_const = false;
      ti->z_mask = -1;
 +    ti->s_mask = 0;
  }
  static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
          ti->is_const = true;
          ti->val = ts->val;
          ti->z_mask = ts->val;
 +        ti->s_mask = smask_from_value(ts->val);
      } else {
          ti->is_const = false;
          ti->z_mask = -1;
 +        ti->s_mask = 0;
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
      op->args[1] = src;
      di->z_mask = si->z_mask;
 +    di->s_mask = si->s_mask;
      if (src_ts->type == dst_ts->type) {
          TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
      nb_oargs = def->nb_oargs;
      for (i = 0; i < nb_oargs; i++) {
 -        reset_temp(op->args[i]);
 +        TCGTemp *ts = arg_temp(op->args[i]);
 +        reset_ts(ts);
          /*
 -         * Save the corresponding known-zero bits mask for the
 +         * Save the corresponding known-zero/sign bits mask for the
           * first output argument (only one supported so far).
           */
          if (i == 0) {
 -            arg_info(op->args[i])->z_mask = ctx->z_mask;
 +            ts_info(ts)->z_mask = ctx->z_mask;
 +            ts_info(ts)->s_mask = ctx->s_mask;
          }
      }
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
  {
      uint64_t a_mask = ctx->a_mask;
      uint64_t z_mask = ctx->z_mask;
 +    uint64_t s_mask = ctx->s_mask;
      /*
       * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
      if (ctx->type == TCG_TYPE_I32) {
          a_mask = (int32_t)a_mask;
          z_mask = (int32_t)z_mask;
 +        s_mask |= MAKE_64BIT_MASK(32, 32);
          ctx->z_mask = z_mask;
 +        ctx->s_mask = s_mask;
      }
      if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
  static bool fold_bswap(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask, sign;
 +    uint64_t z_mask, s_mask, sign;
      if (arg_is_const(op->args[1])) {
          uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      }
      z_mask = arg_info(op->args[1])->z_mask;
 +
      switch (op->opc) {
      case INDEX_op_bswap16_i32:
      case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
      default:
          g_assert_not_reached();
      }
 +    s_mask = smask_from_zmask(z_mask);
      switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
      case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
          /* If the sign bit may be 1, force all the bits above to 1. */
          if (z_mask & sign) {
              z_mask |= sign;
 +            s_mask = sign << 1;
          }
          break;
      default:
          /* The high bits are undefined: force all bits above the sign to 1. */
          z_mask |= sign << 1;
 +        s_mask = 0;
          break;
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = s_mask;
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
  static bool fold_extract(OptContext *ctx, TCGOp *op)
  {
      uint64_t z_mask_old, z_mask;
 +    int pos = op->args[2];
 +    int len = op->args[3];
      if (arg_is_const(op->args[1])) {
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = extract64(t, op->args[2], op->args[3]);
 +        t = extract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
      z_mask_old = arg_info(op->args[1])->z_mask;
 -    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
 -    if (op->args[2] == 0) {
 +    z_mask = extract64(z_mask_old, pos, len);
 +    if (pos == 0) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = smask_from_zmask(z_mask);
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
  static bool fold_exts(OptContext *ctx, TCGOp *op)
  {
 -    uint64_t z_mask_old, z_mask, sign;
 +    uint64_t s_mask_old, s_mask, z_mask, sign;
      bool type_change = false;
      if (fold_const1(ctx, op)) {
          return true;
      }
 -    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    s_mask = arg_info(op->args[1])->s_mask;
 +    s_mask_old = s_mask;
      switch (op->opc) {
      CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
      if (z_mask & sign) {
          z_mask |= sign;
 -    } else if (!type_change) {
 -        ctx->a_mask = z_mask_old ^ z_mask;
      }
 +    s_mask |= sign << 1;
 +
      ctx->z_mask = z_mask;
 +    ctx->s_mask = s_mask;
 +    if (!type_change) {
 +        ctx->a_mask = s_mask & ~s_mask_old;
 +    }
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
      }
      ctx->z_mask = z_mask;
 +    ctx->s_mask = smask_from_zmask(z_mask);
      if (!type_change) {
          ctx->a_mask = z_mask_old ^ z_mask;
      }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
      MemOp mop = get_memop(oi);
      int width = 8 * memop_size(mop);
 -    if (!(mop & MO_SIGN) && width < 64) {
 -        ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +    if (width < 64) {
 +        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
 +        if (!(mop & MO_SIGN)) {
 +            ctx->z_mask = MAKE_64BIT_MASK(0, width);
 +            ctx->s_mask <<= 1;
 +        }
      }
      /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
  static bool fold_sextract(OptContext *ctx, TCGOp *op)
  {
 -    int64_t z_mask_old, z_mask;
 +    uint64_t z_mask, s_mask, s_mask_old;
 +    int pos = op->args[2];
 +    int len = op->args[3];
      if (arg_is_const(op->args[1])) {
          uint64_t t;
          t = arg_info(op->args[1])->val;
 -        t = sextract64(t, op->args[2], op->args[3]);
 +        t = sextract64(t, pos, len);
          return tcg_opt_gen_movi(ctx, op, op->args[0], t);
      }
 -    z_mask_old = arg_info(op->args[1])->z_mask;
 -    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
 -    if (op->args[2] == 0 && z_mask >= 0) {
 -        ctx->a_mask = z_mask_old ^ z_mask;
 -    }
 +    z_mask = arg_info(op->args[1])->z_mask;
 +    z_mask = sextract64(z_mask, pos, len);
      ctx->z_mask = z_mask;
 +    s_mask_old = arg_info(op->args[1])->s_mask;
 +    s_mask = sextract64(s_mask_old, pos, len);
 +    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
 +    ctx->s_mask = s_mask;
 +
 +    if (pos == 0) {
 +        ctx->a_mask = s_mask & ~s_mask_old;
 +    }
 +
      return fold_masks(ctx, op);
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
  {
      /* We can't do any folding with a load, but we can record bits. */
      switch (op->opc) {
 +    CASE_OP_32_64(ld8s):
 +        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
 +        break;
      CASE_OP_32_64(ld8u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 8);
 +        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
 +        break;
 +    CASE_OP_32_64(ld16s):
 +        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
          break;
      CASE_OP_32_64(ld16u):
          ctx->z_mask = MAKE_64BIT_MASK(0, 16);
 +        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
 +        break;
 +    case INDEX_op_ld32s_i64:
 +        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
          break;
      case INDEX_op_ld32u_i64:
          ctx->z_mask = MAKE_64BIT_MASK(0, 32);
 +        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
          break;
      default:
          g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
              ctx.type = TCG_TYPE_I32;
          }
 -        /* Assume all bits affected, and no bits known zero. */
 +        /* Assume all bits affected, no bits known zero, no sign reps. */
          ctx.a_mask = -1;
          ctx.z_mask = -1;
 +        ctx.s_mask = 0;
          /*
           * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
          case INDEX_op_extrh_i64_i32:
              done = fold_extu(&ctx, op);
              break;
 +        CASE_OP_32_64(ld8s):
          CASE_OP_32_64(ld8u):
 +        CASE_OP_32_64(ld16s):
          CASE_OP_32_64(ld16u):
 +        case INDEX_op_ld32s_i64:
          case INDEX_op_ld32u_i64:
              done = fold_tcg_ld(&ctx, op);
              break;
 --
 .25.1

-[PULL 08/16] cputlb: Move ROM handling from I/O path to TLB path
+[PULL 53/56] tcg/optimize: Propagate sign info for logical operations
-It does not require going through the whole I/O path
+Sign repetitions are perforce all identical, whether they are 1 or 0.
-in order to discard a write.
+Bitwise operations preserve the relative quantity of the repetitions.
-Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
 Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-all.h    |  5 ++++-
+ tcg/optimize.c | 29 +++++++++++++++++++++++++++++
- include/exec/cpu-common.h |  1 -
+file changed, 29 insertions(+)
  accel/tcg/cputlb.c        | 36 ++++++++++++++++++++--------------
  exec.c                    | 41 +--------------------------------------
 files changed, 26 insertions(+), 57 deletions(-)
-diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
+@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
- #define TLB_WATCHPOINT      (1 << (TARGET_PAGE_BITS_MIN - 4))
+     z2 = arg_info(op->args[2])->z_mask;
- /* Set if TLB entry requires byte swap.  */
+     ctx->z_mask = z1 & z2;
- #define TLB_BSWAP           (1 << (TARGET_PAGE_BITS_MIN - 5))
-+/* Set if TLB entry writes ignored.  */
++    /*
-+#define TLB_DISCARD_WRITE   (1 << (TARGET_PAGE_BITS_MIN - 6))
++     * Sign repetitions are perforce all identical, whether they are 1 or 0.
++     * Bitwise operations preserve the relative quantity of the repetitions.
- /* Use this mask to check interception with an alignment mask
++     */
-  * in a TCG backend.
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-  */
++                & arg_info(op->args[2])->s_mask;
- #define TLB_FLAGS_MASK \
++
--    (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT | TLB_BSWAP)
+     /*
-+    (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO \
+      * Known-zeros does not imply known-ones.  Therefore unless
-+    | TLB_WATCHPOINT | TLB_BSWAP | TLB_DISCARD_WRITE)
+      * arg2 is constant, we can't infer affected bits from it.
+@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
  /**
   * tlb_hit_page: return true if page aligned @addr is a hit against the
 diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
 index XXXXXXX..XXXXXXX 100644
 --- a/include/exec/cpu-common.h
 +++ b/include/exec/cpu-common.h
@@ -XXX,XX +XXX,XX @@ void qemu_flush_coalesced_mmio_buffer(void);
  void cpu_flush_icache_range(hwaddr start, hwaddr len);
 -extern struct MemoryRegion io_mem_rom;
  extern struct MemoryRegion io_mem_notdirty;
  typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque);
 diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
 index XXXXXXX..XXXXXXX 100644
 --- a/accel/tcg/cputlb.c
 +++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
  {
      uintptr_t addr = tlb_entry->addr_write;
 -    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
 +    if ((addr & (TLB_INVALID_MASK | TLB_MMIO |
 +                 TLB_DISCARD_WRITE | TLB_NOTDIRTY)) == 0) {
          addr &= TARGET_PAGE_MASK;
          addr += tlb_entry->addend;
          if ((addr - start) < length) {
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
          address |= TLB_MMIO;
          addend = 0;
      } else {
 -        /* TLB_MMIO for rom/romd handled below */
          addend = (uintptr_t)memory_region_get_ram_ptr(section->mr) + xlat;
      }
+     ctx->z_mask = z1;
-@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-     tn.addr_write = -1;
++                & arg_info(op->args[2])->s_mask;
-     if (prot & PAGE_WRITE) {
+     return fold_masks(ctx, op);
--        if ((memory_region_is_ram(section->mr) && section->readonly)
+ }
--            || memory_region_is_romd(section->mr)) {
--            /* Write access calls the I/O callback.  */
+@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
--            tn.addr_write = address | TLB_MMIO;
+         fold_xi_to_not(ctx, op, 0)) {
--        } else if (memory_region_is_ram(section->mr)
+         return true;
 -                   && cpu_physical_memory_is_clean(
 -                       memory_region_get_ram_addr(section->mr) + xlat)) {
 -            tn.addr_write = address | TLB_NOTDIRTY;
 -        } else {
 -            tn.addr_write = address;
 +        tn.addr_write = address;
 +        if (memory_region_is_romd(section->mr)) {
 +            /* Use the MMIO path so that the device can switch states. */
 +            tn.addr_write |= TLB_MMIO;
 +        } else if (memory_region_is_ram(section->mr)) {
 +            if (section->readonly) {
 +                tn.addr_write |= TLB_DISCARD_WRITE;
 +            } else if (cpu_physical_memory_is_clean(
 +                        memory_region_get_ram_addr(section->mr) + xlat)) {
 +                tn.addr_write |= TLB_NOTDIRTY;
 +            }
          }
          if (prot & PAGE_WRITE_INV) {
              tn.addr_write |= TLB_INVALID_MASK;
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      mr = section->mr;
      mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
      cpu->mem_io_pc = retaddr;
 -    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
 +    if (mr != &io_mem_notdirty && !cpu->can_do_io) {
          cpu_io_recompile(cpu, retaddr);
      }
++
-@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-     section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
++                & arg_info(op->args[2])->s_mask;
-     mr = section->mr;
+     return false;
-     mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
+ }
--    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
-+    if (mr != &io_mem_notdirty && !cpu->can_do_io) {
+@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
-         cpu_io_recompile(cpu, retaddr);
      ctx->z_mask = arg_info(op->args[3])->z_mask
                  | arg_info(op->args[4])->z_mask;
 +    ctx->s_mask = arg_info(op->args[3])->s_mask
 +                & arg_info(op->args[4])->s_mask;
      if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
          uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, -1)) {
          return true;
      }
-     cpu->mem_io_vaddr = addr;
++
-@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
++    ctx->s_mask = arg_info(op->args[1])->s_mask
 +                & arg_info(op->args[2])->s_mask;
      return false;
  }
@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
          fold_xi_to_not(ctx, op, 0)) {
          return true;
      }
++
-     /* Reject I/O access, or other required slow-path.  */
++    ctx->s_mask = arg_info(op->args[1])->s_mask
--    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP)) {
++                & arg_info(op->args[2])->s_mask;
-+    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
+     return false;
-         return NULL;
+ }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
          return true;
      }
-@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
++    ctx->s_mask = arg_info(op->args[1])->s_mask;
              return;
          }
 +        /* Ignore writes to ROM.  */
 +        if (unlikely(tlb_addr & TLB_DISCARD_WRITE)) {
 +            return;
 +        }
 +
-         haddr = (void *)((uintptr_t)addr + entry->addend);
+     /* Because of fold_to_not, we want to always return true, via finish. */
+     finish_folding(ctx, op);
-         /*
+     return true;
-diff --git a/exec.c b/exec.c
+@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
-index XXXXXXX..XXXXXXX 100644
---- a/exec.c
+     ctx->z_mask = arg_info(op->args[1])->z_mask
-+++ b/exec.c
+                 | arg_info(op->args[2])->z_mask;
-@@ -XXX,XX +XXX,XX @@ static MemoryRegion *system_io;
++    ctx->s_mask = arg_info(op->args[1])->s_mask
- AddressSpace address_space_io;
++                & arg_info(op->args[2])->s_mask;
- AddressSpace address_space_memory;
+     return fold_masks(ctx, op);
 -MemoryRegion io_mem_rom, io_mem_notdirty;
 +MemoryRegion io_mem_notdirty;
  static MemoryRegion io_mem_unassigned;
  #endif
@@ -XXX,XX +XXX,XX @@ typedef struct subpage_t {
  #define PHYS_SECTION_UNASSIGNED 0
  #define PHYS_SECTION_NOTDIRTY 1
 -#define PHYS_SECTION_ROM 2
  static void io_mem_init(void);
  static void memory_map_init(void);
@@ -XXX,XX +XXX,XX @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,
          iotlb = memory_region_get_ram_addr(section->mr) + xlat;
          if (!section->readonly) {
              iotlb |= PHYS_SECTION_NOTDIRTY;
 -        } else {
 -            iotlb |= PHYS_SECTION_ROM;
          }
      } else {
          AddressSpaceDispatch *d;
@@ -XXX,XX +XXX,XX @@ static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
      return phys_section_add(map, &section);
  }
--static void readonly_mem_write(void *opaque, hwaddr addr,
+@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
--                               uint64_t val, unsigned size)
+         fold_ix_to_not(ctx, op, 0)) {
--{
+         return true;
--    /* Ignore any write to ROM. */
+     }
--}
++
--
++    ctx->s_mask = arg_info(op->args[1])->s_mask
--static bool readonly_mem_accepts(void *opaque, hwaddr addr,
++                & arg_info(op->args[2])->s_mask;
--                                 unsigned size, bool is_write,
+     return false;
--                                 MemTxAttrs attrs)
+ }
--{
--    return is_write;
+@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
--}
--
+     ctx->z_mask = arg_info(op->args[1])->z_mask
--/* This will only be used for writes, because reads are special cased
+                 | arg_info(op->args[2])->z_mask;
-- * to directly access the underlying host ram.
++    ctx->s_mask = arg_info(op->args[1])->s_mask
-- */
++                & arg_info(op->args[2])->s_mask;
--static const MemoryRegionOps readonly_mem_ops = {
+     return fold_masks(ctx, op);
--    .write = readonly_mem_write,
+ }
 -    .valid.accepts = readonly_mem_accepts,
 -    .endianness = DEVICE_NATIVE_ENDIAN,
 -    .valid = {
 -        .min_access_size = 1,
 -        .max_access_size = 8,
 -        .unaligned = false,
 -    },
 -    .impl = {
 -        .min_access_size = 1,
 -        .max_access_size = 8,
 -        .unaligned = false,
 -    },
 -};
 -
  MemoryRegionSection *iotlb_to_section(CPUState *cpu,
                                        hwaddr index, MemTxAttrs attrs)
  {
@@ -XXX,XX +XXX,XX @@ MemoryRegionSection *iotlb_to_section(CPUState *cpu,
  static void io_mem_init(void)
  {
 -    memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
 -                          NULL, NULL, UINT64_MAX);
      memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                            NULL, UINT64_MAX);
@@ -XXX,XX +XXX,XX @@ AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
      assert(n == PHYS_SECTION_UNASSIGNED);
      n = dummy_section(&d->map, fv, &io_mem_notdirty);
      assert(n == PHYS_SECTION_NOTDIRTY);
 -    n = dummy_section(&d->map, fv, &io_mem_rom);
 -    assert(n == PHYS_SECTION_ROM);
      d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
 --
-.17.1
+.25.1

-[PULL 12/16] cputlb: Handle TLB_NOTDIRTY in probe_access
+[PULL 54/56] tcg/optimize: Propagate sign info for setcond
-We can use notdirty_write for the write and return a valid host
+The result is either 0 or 1, which means that we have
-pointer for this case.
+a 2 bit signed result, and thus 62 bits of sign.
 For clarity, use the smask_from_zmask function.
-Reviewed-by: David Hildenbrand <david@redhat.com>
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/cputlb.c | 26 +++++++++++++++++---------
+ tcg/optimize.c | 2 ++
-file changed, 17 insertions(+), 9 deletions(-)
+file changed, 2 insertions(+)
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/cputlb.c
+--- a/tcg/optimize.c
-+++ b/accel/tcg/cputlb.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
          return NULL;
      }
--    /* Handle watchpoints.  */
+     ctx->z_mask = 1;
--    if (tlb_addr & TLB_WATCHPOINT) {
++    ctx->s_mask = smask_from_zmask(1);
--        cpu_check_watchpoint(env_cpu(env), addr, size,
+     return false;
--                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
+ }
--                             wp_access, retaddr);
--    }
+@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 +    if (unlikely(tlb_addr & TLB_FLAGS_MASK)) {
 +        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
 -    /* Reject I/O access, or other required slow-path.  */
 -    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
 -        return NULL;
 +        /* Reject I/O access, or other required slow-path.  */
 +        if (tlb_addr & (TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
 +            return NULL;
 +        }
 +
 +        /* Handle watchpoints.  */
 +        if (tlb_addr & TLB_WATCHPOINT) {
 +            cpu_check_watchpoint(env_cpu(env), addr, size,
 +                                 iotlbentry->attrs, wp_access, retaddr);
 +        }
 +
 +        /* Handle clean RAM pages.  */
 +        if (tlb_addr & TLB_NOTDIRTY) {
 +            notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
 +        }
      }
-     return (void *)((uintptr_t)addr + entry->addend);
+     ctx->z_mask = 1;
 +    ctx->s_mask = smask_from_zmask(1);
      return false;
   do_setcond_const:
 --
-.17.1
+.25.1

-[PULL 01/16] exec: Use TARGET_PAGE_BITS_MIN for TLB flags
+[PULL 55/56] tcg/optimize: Propagate sign info for bit counting
-These bits do not need to vary with the actual page size
+The results are generally 6 bit unsigned values, though
-used by the guest.
+the count leading and trailing bits may produce any value
 for a zero input.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
-Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- include/exec/cpu-all.h | 16 ++++++++++------
+ tcg/optimize.c | 3 ++-
-file changed, 10 insertions(+), 6 deletions(-)
+file changed, 2 insertions(+), 1 deletion(-)
-diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/include/exec/cpu-all.h
+--- a/tcg/optimize.c
-+++ b/include/exec/cpu-all.h
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
+@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
+         g_assert_not_reached();
- #if !defined(CONFIG_USER_ONLY)
+     }
+     ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
--/* Flags stored in the low bits of the TLB virtual address.  These are
+-
-- * defined so that fast path ram access is all zeros.
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
-+/*
+     return false;
-+ * Flags stored in the low bits of the TLB virtual address.
+ }
-+ * These are defined so that fast path ram access is all zeros.
-  * The flags all must be between TARGET_PAGE_BITS and
+@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
-  * maximum address alignment bit.
+     default:
-+ *
+         g_assert_not_reached();
-+ * Use TARGET_PAGE_BITS_MIN so that these bits are constant
+     }
-+ * when TARGET_PAGE_BITS_VARY is in effect.
++    ctx->s_mask = smask_from_zmask(ctx->z_mask);
-  */
+     return false;
- /* Zero if TLB entry is valid.  */
+ }
--#define TLB_INVALID_MASK    (1 << (TARGET_PAGE_BITS - 1))
 +#define TLB_INVALID_MASK    (1 << (TARGET_PAGE_BITS_MIN - 1))
  /* Set if TLB entry references a clean RAM page.  The iotlb entry will
     contain the page physical address.  */
 -#define TLB_NOTDIRTY        (1 << (TARGET_PAGE_BITS - 2))
 +#define TLB_NOTDIRTY        (1 << (TARGET_PAGE_BITS_MIN - 2))
  /* Set if TLB entry is an IO callback.  */
 -#define TLB_MMIO            (1 << (TARGET_PAGE_BITS - 3))
 +#define TLB_MMIO            (1 << (TARGET_PAGE_BITS_MIN - 3))
  /* Set if TLB entry contains a watchpoint.  */
 -#define TLB_WATCHPOINT      (1 << (TARGET_PAGE_BITS - 4))
 +#define TLB_WATCHPOINT      (1 << (TARGET_PAGE_BITS_MIN - 4))
  /* Use this mask to check interception with an alignment mask
   * in a TCG backend.
 --
-.17.1
+.25.1

-[PULL 04/16] cputlb: Use qemu_build_not_reached in load/store_helpers
+[PULL 56/56] tcg/optimize: Propagate sign info for shifting
-Increase the current runtime assert to a compile-time assert.
+For constant shifts, we can simply shift the s_mask.
-Reviewed-by: David Hildenbrand <david@redhat.com>
+For variable shifts, we know that sar does not reduce
 the s_mask, which helps for sequences like
     ext32s_i64  t, in
     sar_i64     t, t, v
     ext32s_i64  out, t
 allowing the final extend to be eliminated.
 Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
+Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
 Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
 ---
- accel/tcg/cputlb.c | 5 ++---
+ tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
-file changed, 2 insertions(+), 3 deletions(-)
+file changed, 47 insertions(+), 3 deletions(-)
-diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
+diff --git a/tcg/optimize.c b/tcg/optimize.c
 index XXXXXXX..XXXXXXX 100644
---- a/accel/tcg/cputlb.c
+--- a/tcg/optimize.c
-+++ b/accel/tcg/cputlb.c
++++ b/tcg/optimize.c
-@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
+@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
-         res = ldq_le_p(haddr);
+     return ~(~0ull >> rep);
-         break;
+ }
-     default:
--        g_assert_not_reached();
++/*
-+        qemu_build_not_reached();
++ * Recreate a properly left-aligned smask after manipulation.
 + * Some bit-shuffling, particularly shifts and rotates, may
 + * retain sign bits on the left, but may scatter disconnected
 + * sign bits on the right.  Retain only what remains to the left.
 + */
 +static uint64_t smask_from_smask(int64_t smask)
 +{
 +    /* Only the 1 bits are significant for smask */
 +    return smask_from_zmask(~smask);
 +}
 +
  static inline TempOptInfo *ts_info(TCGTemp *ts)
  {
      return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
  static bool fold_shift(OptContext *ctx, TCGOp *op)
  {
 +    uint64_t s_mask, z_mask, sign;
 +
      if (fold_const2(ctx, op) ||
          fold_ix_to_i(ctx, op, 0) ||
          fold_xi_to_x(ctx, op, 0)) {
          return true;
      }
-     return res;
++    s_mask = arg_info(op->args[1])->s_mask;
-@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
++    z_mask = arg_info(op->args[1])->z_mask;
-         stq_le_p(haddr, val);
++
-         break;
+     if (arg_is_const(op->args[2])) {
-     default:
+-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
--        g_assert_not_reached();
+-                                          arg_info(op->args[1])->z_mask,
--        break;
+-                                          arg_info(op->args[2])->val);
-+        qemu_build_not_reached();
++        int sh = arg_info(op->args[2])->val;
 +
 +        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
 +
 +        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
 +        ctx->s_mask = smask_from_smask(s_mask);
 +
          return fold_masks(ctx, op);
      }
++
++    switch (op->opc) {
++    CASE_OP_32_64(sar):
++        /*
++         * Arithmetic right shift will not reduce the number of
++         * input sign repetitions.
++         */
++        ctx->s_mask = s_mask;
++        break;
++    CASE_OP_32_64(shr):
++        /*
++         * If the sign bit is known zero, then logical right shift
++         * will not reduce the number of input sign repetitions.
++         */
++        sign = (s_mask & -s_mask) >> 1;
++        if (!(z_mask & sign)) {
++            ctx->s_mask = s_mask;
++        }
++        break;
++    default:
++        break;
++    }
++
+     return false;
  }
 --
-.17.1
+.25.1

This is v4 of my notdirty + rom patch set with two suggested name
changes (qemu_build_not_reached, TLB_DISCARD_WRITE) from David and Alex.

The following changes since commit 240ab11fb72049d6373cbbec8d788f8e411a00bc:

Merge remote-tracking branch 'remotes/aperard/tags/pull-xen-20190924' into staging (2019-09-24 15:36:31 +0100)

are available in the Git repository at:

https://github.com/rth7680/qemu.git tags/pull-tcg-20190925

for you to fetch changes up to ae57db63acf5a0399232f852acc5c1d83ef63400:

cputlb: Pass retaddr to tb_check_watchpoint (2019-09-25 10:56:28 -0700)

----------------------------------------------------------------
Fixes for TLB_BSWAP
Conversion of NOTDIRTY and ROM handling to cputlb
Followup cleanups to cputlb

----------------------------------------------------------------
Richard Henderson (16):
      exec: Use TARGET_PAGE_BITS_MIN for TLB flags
      cputlb: Disable __always_inline__ without optimization
      qemu/compiler.h: Add qemu_build_not_reached
      cputlb: Use qemu_build_not_reached in load/store_helpers
      cputlb: Split out load/store_memop
      cputlb: Introduce TLB_BSWAP
      exec: Adjust notdirty tracing
      cputlb: Move ROM handling from I/O path to TLB path
      cputlb: Move NOTDIRTY handling from I/O path to TLB path
      cputlb: Partially inline memory_region_section_get_iotlb
      cputlb: Merge and move memory_notdirty_write_{prepare,complete}
      cputlb: Handle TLB_NOTDIRTY in probe_access
      cputlb: Remove cpu->mem_io_vaddr
      cputlb: Remove tb_invalidate_phys_page_range is_cpu_write_access
      cputlb: Pass retaddr to tb_invalidate_phys_page_fast
      cputlb: Pass retaddr to tb_check_watchpoint

These bits do not need to vary with the actual page size
used by the guest.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
 
 #if !defined(CONFIG_USER_ONLY)
 
-/* Flags stored in the low bits of the TLB virtual address.  These are
- * defined so that fast path ram access is all zeros.
+/*
+ * Flags stored in the low bits of the TLB virtual address.
+ * These are defined so that fast path ram access is all zeros.
  * The flags all must be between TARGET_PAGE_BITS and
  * maximum address alignment bit.
+ *
+ * Use TARGET_PAGE_BITS_MIN so that these bits are constant
+ * when TARGET_PAGE_BITS_VARY is in effect.
  */
 /* Zero if TLB entry is valid.  */
-#define TLB_INVALID_MASK    (1 << (TARGET_PAGE_BITS - 1))
+#define TLB_INVALID_MASK    (1 << (TARGET_PAGE_BITS_MIN - 1))
 /* Set if TLB entry references a clean RAM page.  The iotlb entry will
    contain the page physical address.  */
-#define TLB_NOTDIRTY        (1 << (TARGET_PAGE_BITS - 2))
+#define TLB_NOTDIRTY        (1 << (TARGET_PAGE_BITS_MIN - 2))
 /* Set if TLB entry is an IO callback.  */
-#define TLB_MMIO            (1 << (TARGET_PAGE_BITS - 3))
+#define TLB_MMIO            (1 << (TARGET_PAGE_BITS_MIN - 3))
 /* Set if TLB entry contains a watchpoint.  */
-#define TLB_WATCHPOINT      (1 << (TARGET_PAGE_BITS - 4))
+#define TLB_WATCHPOINT      (1 << (TARGET_PAGE_BITS_MIN - 4))
 
 /* Use this mask to check interception with an alignment mask
  * in a TCG backend.
-- 
2.17.1

This forced inlining can result in missing symbols,
which makes a debugging build harder to follow.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reported-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/compiler.h | 11 +++++++++++
 accel/tcg/cputlb.c      |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -XXX,XX +XXX,XX @@
 # define QEMU_NONSTRING
 #endif
 
+/*
+ * Forced inlining may be desired to encourage constant propagation
+ * of function parameters.  However, it can also make debugging harder,
+ * so disable it for a non-optimizing build.
+ */
+#if defined(__OPTIMIZE__)
+#define QEMU_ALWAYS_INLINE  __attribute__((always_inline))
+#else
+#define QEMU_ALWAYS_INLINE
+#endif
+
 /* Implement C11 _Generic via GCC builtins.  Example:
  *
  *    QEMU_GENERIC(x, (float, sinf), (long double, sinl), sin) (x)
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
 typedef uint64_t FullLoadHelper(CPUArchState *env, target_ulong addr,
                                 TCGMemOpIdx oi, uintptr_t retaddr);
 
-static inline uint64_t __attribute__((always_inline))
+static inline uint64_t QEMU_ALWAYS_INLINE
 load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
             uintptr_t retaddr, MemOp op, bool code_read,
             FullLoadHelper *full_load)
@@ -XXX,XX +XXX,XX @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
  * Store Helpers
  */
 
-static inline void __attribute__((always_inline))
+static inline void QEMU_ALWAYS_INLINE
 store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
              TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
 {
-- 
2.17.1

Use this as a compile-time assert that a particular
code path is not reachable.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/compiler.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -XXX,XX +XXX,XX @@
 #define QEMU_GENERIC9(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC8(x, __VA_ARGS__))
 #define QEMU_GENERIC10(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC9(x, __VA_ARGS__))
 
+/**
+ * qemu_build_not_reached()
+ *
+ * The compiler, during optimization, is expected to prove that a call
+ * to this function cannot be reached and remove it.  If the compiler
+ * supports QEMU_ERROR, this will be reported at compile time; otherwise
+ * this will be reported at link time due to the missing symbol.
+ */
+#ifdef __OPTIMIZE__
+extern void QEMU_NORETURN QEMU_ERROR("code path is reachable")
+    qemu_build_not_reached(void);
+#else
+#define qemu_build_not_reached()  g_assert_not_reached()
+#endif
+
 #endif /* COMPILER_H */
-- 
2.17.1

We will shortly be using these more than once.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 107 +++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 52 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
 typedef uint64_t FullLoadHelper(CPUArchState *env, target_ulong addr,
                                 TCGMemOpIdx oi, uintptr_t retaddr);
 
+static inline uint64_t QEMU_ALWAYS_INLINE
+load_memop(const void *haddr, MemOp op)
+{
+    switch (op) {
+    case MO_UB:
+        return ldub_p(haddr);
+    case MO_BEUW:
+        return lduw_be_p(haddr);
+    case MO_LEUW:
+        return lduw_le_p(haddr);
+    case MO_BEUL:
+        return (uint32_t)ldl_be_p(haddr);
+    case MO_LEUL:
+        return (uint32_t)ldl_le_p(haddr);
+    case MO_BEQ:
+        return ldq_be_p(haddr);
+    case MO_LEQ:
+        return ldq_le_p(haddr);
+    default:
+        qemu_build_not_reached();
+    }
+}
+
 static inline uint64_t QEMU_ALWAYS_INLINE
 load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
             uintptr_t retaddr, MemOp op, bool code_read,
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
 
  do_aligned_access:
     haddr = (void *)((uintptr_t)addr + entry->addend);
-    switch (op) {
-    case MO_UB:
-        res = ldub_p(haddr);
-        break;
-    case MO_BEUW:
-        res = lduw_be_p(haddr);
-        break;
-    case MO_LEUW:
-        res = lduw_le_p(haddr);
-        break;
-    case MO_BEUL:
-        res = (uint32_t)ldl_be_p(haddr);
-        break;
-    case MO_LEUL:
-        res = (uint32_t)ldl_le_p(haddr);
-        break;
-    case MO_BEQ:
-        res = ldq_be_p(haddr);
-        break;
-    case MO_LEQ:
-        res = ldq_le_p(haddr);
-        break;
-    default:
-        qemu_build_not_reached();
-    }
-
-    return res;
+    return load_memop(haddr, op);
 }
 
 /*
@@ -XXX,XX +XXX,XX @@ tcg_target_ulong helper_be_ldsl_mmu(CPUArchState *env, target_ulong addr,
  * Store Helpers
  */
 
+static inline void QEMU_ALWAYS_INLINE
+store_memop(void *haddr, uint64_t val, MemOp op)
+{
+    switch (op) {
+    case MO_UB:
+        stb_p(haddr, val);
+        break;
+    case MO_BEUW:
+        stw_be_p(haddr, val);
+        break;
+    case MO_LEUW:
+        stw_le_p(haddr, val);
+        break;
+    case MO_BEUL:
+        stl_be_p(haddr, val);
+        break;
+    case MO_LEUL:
+        stl_le_p(haddr, val);
+        break;
+    case MO_BEQ:
+        stq_be_p(haddr, val);
+        break;
+    case MO_LEQ:
+        stq_le_p(haddr, val);
+        break;
+    default:
+        qemu_build_not_reached();
+    }
+}
+
 static inline void QEMU_ALWAYS_INLINE
 store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
              TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
 
  do_aligned_access:
     haddr = (void *)((uintptr_t)addr + entry->addend);
-    switch (op) {
-    case MO_UB:
-        stb_p(haddr, val);
-        break;
-    case MO_BEUW:
-        stw_be_p(haddr, val);
-        break;
-    case MO_LEUW:
-        stw_le_p(haddr, val);
-        break;
-    case MO_BEUL:
-        stl_be_p(haddr, val);
-        break;
-    case MO_LEUL:
-        stl_le_p(haddr, val);
-        break;
-    case MO_BEQ:
-        stq_be_p(haddr, val);
-        break;
-    case MO_LEQ:
-        stq_le_p(haddr, val);
-        break;
-    default:
-        qemu_build_not_reached();
-    }
+    store_memop(haddr, val, op);
 }
 
 void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
-- 
2.17.1

Handle byte swap on RAM directly in load/store_helper.  This fixes a
bug in the previous implementation, which forced byte-swapped accesses
through the I/O slow path even though that path cannot be used for RAM.

Fixes: a26fc6f5152b47f1
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h |  4 ++-
 accel/tcg/cputlb.c     | 72 +++++++++++++++++++++++++-----------------
 2 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
 #define TLB_MMIO            (1 << (TARGET_PAGE_BITS_MIN - 3))
 /* Set if TLB entry contains a watchpoint.  */
 #define TLB_WATCHPOINT      (1 << (TARGET_PAGE_BITS_MIN - 4))
+/* Set if TLB entry requires byte swap.  */
+#define TLB_BSWAP           (1 << (TARGET_PAGE_BITS_MIN - 5))
 
 /* Use this mask to check interception with an alignment mask
  * in a TCG backend.
  */
 #define TLB_FLAGS_MASK \
-    (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT)
+    (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT | TLB_BSWAP)
 
 /**
  * tlb_hit_page: return true if page aligned @addr is a hit against the
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
         address |= TLB_INVALID_MASK;
     }
     if (attrs.byte_swap) {
-        /* Force the access through the I/O slow path.  */
-        address |= TLB_MMIO;
+        address |= TLB_BSWAP;
     }
     if (!memory_region_is_ram(section->mr) &&
         !memory_region_is_romd(section->mr)) {
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     bool locked = false;
     MemTxResult r;
 
-    if (iotlbentry->attrs.byte_swap) {
-        op ^= MO_BSWAP;
-    }
-
     section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
     mr = section->mr;
     mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     bool locked = false;
     MemTxResult r;
 
-    if (iotlbentry->attrs.byte_swap) {
-        op ^= MO_BSWAP;
-    }
-
     section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
     mr = section->mr;
     mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
                              wp_access, retaddr);
     }
 
-    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO)) {
-        /* I/O access */
+    /* Reject I/O access, or other required slow-path.  */
+    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP)) {
         return NULL;
     }
 
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
     /* Handle anything that isn't just a straight memory access.  */
     if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
         CPUIOTLBEntry *iotlbentry;
+        bool need_swap;
 
         /* For anything that is unaligned, recurse through full_load.  */
         if ((addr & (size - 1)) != 0) {
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
             /* On watchpoint hit, this will longjmp out.  */
             cpu_check_watchpoint(env_cpu(env), addr, size,
                                  iotlbentry->attrs, BP_MEM_READ, retaddr);
-
-            /* The backing page may or may not require I/O.  */
-            tlb_addr &= ~TLB_WATCHPOINT;
-            if ((tlb_addr & ~TARGET_PAGE_MASK) == 0) {
-                goto do_aligned_access;
-            }
         }
 
+        need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
+
         /* Handle I/O access.  */
-        return io_readx(env, iotlbentry, mmu_idx, addr,
-                        retaddr, access_type, op);
+        if (likely(tlb_addr & TLB_MMIO)) {
+            return io_readx(env, iotlbentry, mmu_idx, addr, retaddr,
+                            access_type, op ^ (need_swap * MO_BSWAP));
+        }
+
+        haddr = (void *)((uintptr_t)addr + entry->addend);
+
+        /*
+         * Keep these two load_memop separate to ensure that the compiler
+         * is able to fold the entire function to a single instruction.
+         * There is a build-time assert inside to remind you of this.  ;-)
+         */
+        if (unlikely(need_swap)) {
+            return load_memop(haddr, op ^ MO_BSWAP);
+        }
+        return load_memop(haddr, op);
     }
 
     /* Handle slow unaligned access (it spans two pages or IO).  */
@@ -XXX,XX +XXX,XX @@ load_helper(CPUArchState *env, target_ulong addr, TCGMemOpIdx oi,
         return res & MAKE_64BIT_MASK(0, size * 8);
     }
 
- do_aligned_access:
     haddr = (void *)((uintptr_t)addr + entry->addend);
     return load_memop(haddr, op);
 }
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     /* Handle anything that isn't just a straight memory access.  */
     if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
         CPUIOTLBEntry *iotlbentry;
+        bool need_swap;
 
         /* For anything that is unaligned, recurse through byte stores.  */
         if ((addr & (size - 1)) != 0) {
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
             /* On watchpoint hit, this will longjmp out.  */
             cpu_check_watchpoint(env_cpu(env), addr, size,
                                  iotlbentry->attrs, BP_MEM_WRITE, retaddr);
-
-            /* The backing page may or may not require I/O.  */
-            tlb_addr &= ~TLB_WATCHPOINT;
-            if ((tlb_addr & ~TARGET_PAGE_MASK) == 0) {
-                goto do_aligned_access;
-            }
         }
 
+        need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
+
         /* Handle I/O access.  */
-        io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr, op);
+        if (likely(tlb_addr & (TLB_MMIO | TLB_NOTDIRTY))) {
+            io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
+                      op ^ (need_swap * MO_BSWAP));
+            return;
+        }
+
+        haddr = (void *)((uintptr_t)addr + entry->addend);
+
+        /*
+         * Keep these two store_memop separate to ensure that the compiler
+         * is able to fold the entire function to a single instruction.
+         * There is a build-time assert inside to remind you of this.  ;-)
+         */
+        if (unlikely(need_swap)) {
+            store_memop(haddr, val, op ^ MO_BSWAP);
+        } else {
+            store_memop(haddr, val, op);
+        }
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
         return;
     }
 
- do_aligned_access:
     haddr = (void *)((uintptr_t)addr + entry->addend);
     store_memop(haddr, val, op);
 }
-- 
2.17.1

The memory_region_tb_read tracepoint is unreachable, since notdirty
is supposed to apply only to writes.  The memory_region_tb_write
tracepoint is mis-named, because notdirty is not only used for TB
invalidation.  It is also used for e.g. VGA RAM updates and migration.

Replace memory_region_tb_write with memory_notdirty_write_access,
and place it in memory_notdirty_write_prepare where it can catch
all of the instances.  Add memory_notdirty_set_dirty to log when
we no longer intercept writes to a page.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 exec.c       | 3 +++
 memory.c     | 4 ----
 trace-events | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
     ndi->size = size;
     ndi->pages = NULL;
 
+    trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
+
     assert(tcg_enabled());
     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
         ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_complete(NotDirtyInfo *ndi)
     /* we remove the notdirty callback only if the code has been
        flushed */
     if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
+        trace_memory_notdirty_set_dirty(ndi->mem_vaddr);
         tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
     }
 }
diff --git a/memory.c b/memory.c
index XXXXXXX..XXXXXXX 100644
--- a/memory.c
+++ b/memory.c
@@ -XXX,XX +XXX,XX @@ static MemTxResult  memory_region_read_accessor(MemoryRegion *mr,
         /* Accesses to code which has previously been translated into a TB show
          * up in the MMIO path, as accesses to the io_mem_notdirty
          * MemoryRegion. */
-        trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size);
     } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
         trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr,
         /* Accesses to code which has previously been translated into a TB show
          * up in the MMIO path, as accesses to the io_mem_notdirty
          * MemoryRegion. */
-        trace_memory_region_tb_read(get_cpu_index(), addr, tmp, size);
     } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
         trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
         /* Accesses to code which has previously been translated into a TB show
          * up in the MMIO path, as accesses to the io_mem_notdirty
          * MemoryRegion. */
-        trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size);
     } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
         trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr,
         /* Accesses to code which has previously been translated into a TB show
          * up in the MMIO path, as accesses to the io_mem_notdirty
          * MemoryRegion. */
-        trace_memory_region_tb_write(get_cpu_index(), addr, tmp, size);
     } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
         trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
diff --git a/trace-events b/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/trace-events
+++ b/trace-events
@@ -XXX,XX +XXX,XX @@ dma_map_wait(void *dbs) "dbs=%p"
 find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx64
 find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64
 ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d"
+memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
+memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
 
 # memory.c
 memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
 memory_region_ops_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
 memory_region_subpage_read(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
 memory_region_subpage_write(int cpu_index, void *mr, uint64_t offset, uint64_t value, unsigned size) "cpu %d mr %p offset 0x%"PRIx64" value 0x%"PRIx64" size %u"
-memory_region_tb_read(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
-memory_region_tb_write(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
 memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
 memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
 flatview_new(void *view, void *root) "%p (root %p)"
-- 
2.17.1

It does not require going through the whole I/O path
in order to discard a write.

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h    |  5 ++++-
 include/exec/cpu-common.h |  1 -
 accel/tcg/cputlb.c        | 36 ++++++++++++++++++++--------------
 exec.c                    | 41 +--------------------------------------
 4 files changed, 26 insertions(+), 57 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -XXX,XX +XXX,XX @@ CPUArchState *cpu_copy(CPUArchState *env);
 #define TLB_WATCHPOINT      (1 << (TARGET_PAGE_BITS_MIN - 4))
 /* Set if TLB entry requires byte swap.  */
 #define TLB_BSWAP           (1 << (TARGET_PAGE_BITS_MIN - 5))
+/* Set if TLB entry writes ignored.  */
+#define TLB_DISCARD_WRITE   (1 << (TARGET_PAGE_BITS_MIN - 6))
 
 /* Use this mask to check interception with an alignment mask
  * in a TCG backend.
  */
 #define TLB_FLAGS_MASK \
-    (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO | TLB_WATCHPOINT | TLB_BSWAP)
+    (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO \
+    | TLB_WATCHPOINT | TLB_BSWAP | TLB_DISCARD_WRITE)
 
 /**
  * tlb_hit_page: return true if page aligned @addr is a hit against the
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -XXX,XX +XXX,XX @@ void qemu_flush_coalesced_mmio_buffer(void);
 
 void cpu_flush_icache_range(hwaddr start, hwaddr len);
 
-extern struct MemoryRegion io_mem_rom;
 extern struct MemoryRegion io_mem_notdirty;
 
 typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
 {
     uintptr_t addr = tlb_entry->addr_write;
 
-    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
+    if ((addr & (TLB_INVALID_MASK | TLB_MMIO |
+                 TLB_DISCARD_WRITE | TLB_NOTDIRTY)) == 0) {
         addr &= TARGET_PAGE_MASK;
         addr += tlb_entry->addend;
         if ((addr - start) < length) {
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
         address |= TLB_MMIO;
         addend = 0;
     } else {
-        /* TLB_MMIO for rom/romd handled below */
         addend = (uintptr_t)memory_region_get_ram_ptr(section->mr) + xlat;
     }
 
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
 
     tn.addr_write = -1;
     if (prot & PAGE_WRITE) {
-        if ((memory_region_is_ram(section->mr) && section->readonly)
-            || memory_region_is_romd(section->mr)) {
-            /* Write access calls the I/O callback.  */
-            tn.addr_write = address | TLB_MMIO;
-        } else if (memory_region_is_ram(section->mr)
-                   && cpu_physical_memory_is_clean(
-                       memory_region_get_ram_addr(section->mr) + xlat)) {
-            tn.addr_write = address | TLB_NOTDIRTY;
-        } else {
-            tn.addr_write = address;
+        tn.addr_write = address;
+        if (memory_region_is_romd(section->mr)) {
+            /* Use the MMIO path so that the device can switch states. */
+            tn.addr_write |= TLB_MMIO;
+        } else if (memory_region_is_ram(section->mr)) {
+            if (section->readonly) {
+                tn.addr_write |= TLB_DISCARD_WRITE;
+            } else if (cpu_physical_memory_is_clean(
+                        memory_region_get_ram_addr(section->mr) + xlat)) {
+                tn.addr_write |= TLB_NOTDIRTY;
+            }
         }
         if (prot & PAGE_WRITE_INV) {
             tn.addr_write |= TLB_INVALID_MASK;
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     mr = section->mr;
     mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
     cpu->mem_io_pc = retaddr;
-    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
+    if (mr != &io_mem_notdirty && !cpu->can_do_io) {
         cpu_io_recompile(cpu, retaddr);
     }
 
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
     mr = section->mr;
     mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
-    if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
+    if (mr != &io_mem_notdirty && !cpu->can_do_io) {
         cpu_io_recompile(cpu, retaddr);
     }
     cpu->mem_io_vaddr = addr;
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
     }
 
     /* Reject I/O access, or other required slow-path.  */
-    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP)) {
+    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
         return NULL;
     }
 
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
             return;
         }
 
+        /* Ignore writes to ROM.  */
+        if (unlikely(tlb_addr & TLB_DISCARD_WRITE)) {
+            return;
+        }
+
         haddr = (void *)((uintptr_t)addr + entry->addend);
 
         /*
diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ static MemoryRegion *system_io;
 AddressSpace address_space_io;
 AddressSpace address_space_memory;
 
-MemoryRegion io_mem_rom, io_mem_notdirty;
+MemoryRegion io_mem_notdirty;
 static MemoryRegion io_mem_unassigned;
 #endif
 
@@ -XXX,XX +XXX,XX @@ typedef struct subpage_t {
 
 #define PHYS_SECTION_UNASSIGNED 0
 #define PHYS_SECTION_NOTDIRTY 1
-#define PHYS_SECTION_ROM 2
 
 static void io_mem_init(void);
 static void memory_map_init(void);
@@ -XXX,XX +XXX,XX @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,
         iotlb = memory_region_get_ram_addr(section->mr) + xlat;
         if (!section->readonly) {
             iotlb |= PHYS_SECTION_NOTDIRTY;
-        } else {
-            iotlb |= PHYS_SECTION_ROM;
         }
     } else {
         AddressSpaceDispatch *d;
@@ -XXX,XX +XXX,XX @@ static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
     return phys_section_add(map, &section);
 }
 
-static void readonly_mem_write(void *opaque, hwaddr addr,
-                               uint64_t val, unsigned size)
-{
-    /* Ignore any write to ROM. */
-}
-
-static bool readonly_mem_accepts(void *opaque, hwaddr addr,
-                                 unsigned size, bool is_write,
-                                 MemTxAttrs attrs)
-{
-    return is_write;
-}
-
-/* This will only be used for writes, because reads are special cased
- * to directly access the underlying host ram.
- */
-static const MemoryRegionOps readonly_mem_ops = {
-    .write = readonly_mem_write,
-    .valid.accepts = readonly_mem_accepts,
-    .endianness = DEVICE_NATIVE_ENDIAN,
-    .valid = {
-        .min_access_size = 1,
-        .max_access_size = 8,
-        .unaligned = false,
-    },
-    .impl = {
-        .min_access_size = 1,
-        .max_access_size = 8,
-        .unaligned = false,
-    },
-};
-
 MemoryRegionSection *iotlb_to_section(CPUState *cpu,
                                       hwaddr index, MemTxAttrs attrs)
 {
@@ -XXX,XX +XXX,XX @@ MemoryRegionSection *iotlb_to_section(CPUState *cpu,
 
 static void io_mem_init(void)
 {
-    memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
-                          NULL, NULL, UINT64_MAX);
     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                           NULL, UINT64_MAX);
 
@@ -XXX,XX +XXX,XX @@ AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
     assert(n == PHYS_SECTION_UNASSIGNED);
     n = dummy_section(&d->map, fv, &io_mem_notdirty);
     assert(n == PHYS_SECTION_NOTDIRTY);
-    n = dummy_section(&d->map, fv, &io_mem_rom);
-    assert(n == PHYS_SECTION_ROM);
 
     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
 
-- 
2.17.1

Pages that we want to track for NOTDIRTY are RAM.  We do not
really need to go through the I/O path to handle them.

Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-common.h |  2 --
 accel/tcg/cputlb.c        | 26 +++++++++++++++++---
 exec.c                    | 50 ---------------------------------------
 memory.c                  | 16 -------------
 4 files changed, 23 insertions(+), 71 deletions(-)

diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -XXX,XX +XXX,XX @@ void qemu_flush_coalesced_mmio_buffer(void);
 
 void cpu_flush_icache_range(hwaddr start, hwaddr len);
 
-extern struct MemoryRegion io_mem_notdirty;
-
 typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque);
 
 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     mr = section->mr;
     mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
     cpu->mem_io_pc = retaddr;
-    if (mr != &io_mem_notdirty && !cpu->can_do_io) {
+    if (!cpu->can_do_io) {
         cpu_io_recompile(cpu, retaddr);
     }
 
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
     mr = section->mr;
     mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
-    if (mr != &io_mem_notdirty && !cpu->can_do_io) {
+    if (!cpu->can_do_io) {
         cpu_io_recompile(cpu, retaddr);
     }
     cpu->mem_io_vaddr = addr;
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
         need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
 
         /* Handle I/O access.  */
-        if (likely(tlb_addr & (TLB_MMIO | TLB_NOTDIRTY))) {
+        if (tlb_addr & TLB_MMIO) {
             io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
                       op ^ (need_swap * MO_BSWAP));
             return;
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
 
         haddr = (void *)((uintptr_t)addr + entry->addend);
 
+        /* Handle clean RAM pages.  */
+        if (tlb_addr & TLB_NOTDIRTY) {
+            NotDirtyInfo ndi;
+
+            /* We require mem_io_pc in tb_invalidate_phys_page_range.  */
+            env_cpu(env)->mem_io_pc = retaddr;
+
+            memory_notdirty_write_prepare(&ndi, env_cpu(env), addr,
+                                          addr + iotlbentry->addr, size);
+
+            if (unlikely(need_swap)) {
+                store_memop(haddr, val, op ^ MO_BSWAP);
+            } else {
+                store_memop(haddr, val, op);
+            }
+
+            memory_notdirty_write_complete(&ndi);
+            return;
+        }
+
         /*
          * Keep these two store_memop separate to ensure that the compiler
          * is able to fold the entire function to a single instruction.
diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ static MemoryRegion *system_io;
 AddressSpace address_space_io;
 AddressSpace address_space_memory;
 
-MemoryRegion io_mem_notdirty;
 static MemoryRegion io_mem_unassigned;
 #endif
 
@@ -XXX,XX +XXX,XX @@ typedef struct subpage_t {
 } subpage_t;
 
 #define PHYS_SECTION_UNASSIGNED 0
-#define PHYS_SECTION_NOTDIRTY 1
 
 static void io_mem_init(void);
 static void memory_map_init(void);
@@ -XXX,XX +XXX,XX @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,
     if (memory_region_is_ram(section->mr)) {
         /* Normal RAM.  */
         iotlb = memory_region_get_ram_addr(section->mr) + xlat;
-        if (!section->readonly) {
-            iotlb |= PHYS_SECTION_NOTDIRTY;
-        }
     } else {
         AddressSpaceDispatch *d;
 
@@ -XXX,XX +XXX,XX @@ void memory_notdirty_write_complete(NotDirtyInfo *ndi)
     }
 }
 
-/* Called within RCU critical section.  */
-static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
-                               uint64_t val, unsigned size)
-{
-    NotDirtyInfo ndi;
-
-    memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
-                         ram_addr, size);
-
-    stn_p(qemu_map_ram_ptr(NULL, ram_addr), size, val);
-    memory_notdirty_write_complete(&ndi);
-}
-
-static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
-                                 unsigned size, bool is_write,
-                                 MemTxAttrs attrs)
-{
-    return is_write;
-}
-
-static const MemoryRegionOps notdirty_mem_ops = {
-    .write = notdirty_mem_write,
-    .valid.accepts = notdirty_mem_accepts,
-    .endianness = DEVICE_NATIVE_ENDIAN,
-    .valid = {
-        .min_access_size = 1,
-        .max_access_size = 8,
-        .unaligned = false,
-    },
-    .impl = {
-        .min_access_size = 1,
-        .max_access_size = 8,
-        .unaligned = false,
-    },
-};
-
 /* Generate a debug exception if a watchpoint has been hit.  */
 void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
                           MemTxAttrs attrs, int flags, uintptr_t ra)
@@ -XXX,XX +XXX,XX @@ static void io_mem_init(void)
 {
     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                           NULL, UINT64_MAX);
-
-    /* io_mem_notdirty calls tb_invalidate_phys_page_fast,
-     * which can be called without the iothread mutex.
-     */
-    memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
-                          NULL, UINT64_MAX);
-    memory_region_clear_global_locking(&io_mem_notdirty);
 }
 
 AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
@@ -XXX,XX +XXX,XX @@ AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
 
     n = dummy_section(&d->map, fv, &io_mem_unassigned);
     assert(n == PHYS_SECTION_UNASSIGNED);
-    n = dummy_section(&d->map, fv, &io_mem_notdirty);
-    assert(n == PHYS_SECTION_NOTDIRTY);
 
     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
 
diff --git a/memory.c b/memory.c
index XXXXXXX..XXXXXXX 100644
--- a/memory.c
+++ b/memory.c
@@ -XXX,XX +XXX,XX @@ static MemTxResult  memory_region_read_accessor(MemoryRegion *mr,
     tmp = mr->ops->read(mr->opaque, addr, size);
     if (mr->subpage) {
         trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size);
-    } else if (mr == &io_mem_notdirty) {
-        /* Accesses to code which has previously been translated into a TB show
-         * up in the MMIO path, as accesses to the io_mem_notdirty
-         * MemoryRegion. */
     } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
         trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr,
     r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs);
     if (mr->subpage) {
         trace_memory_region_subpage_read(get_cpu_index(), mr, addr, tmp, size);
-    } else if (mr == &io_mem_notdirty) {
-        /* Accesses to code which has previously been translated into a TB show
-         * up in the MMIO path, as accesses to the io_mem_notdirty
-         * MemoryRegion. */
     } else if (TRACE_MEMORY_REGION_OPS_READ_ENABLED) {
         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
         trace_memory_region_ops_read(get_cpu_index(), mr, abs_addr, tmp, size);
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr,
 
     if (mr->subpage) {
         trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
-    } else if (mr == &io_mem_notdirty) {
-        /* Accesses to code which has previously been translated into a TB show
-         * up in the MMIO path, as accesses to the io_mem_notdirty
-         * MemoryRegion. */
     } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
         trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
@@ -XXX,XX +XXX,XX @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr,
 
     if (mr->subpage) {
         trace_memory_region_subpage_write(get_cpu_index(), mr, addr, tmp, size);
-    } else if (mr == &io_mem_notdirty) {
-        /* Accesses to code which has previously been translated into a TB show
-         * up in the MMIO path, as accesses to the io_mem_notdirty
-         * MemoryRegion. */
     } else if (TRACE_MEMORY_REGION_OPS_WRITE_ENABLED) {
         hwaddr abs_addr = memory_region_to_absolute_addr(mr, addr);
         trace_memory_region_ops_write(get_cpu_index(), mr, abs_addr, tmp, size);
-- 
2.17.1

There is only one caller, tlb_set_page_with_attrs.  We cannot
inline the entire function because the AddressSpaceDispatch
structure is private to exec.c, and cannot easily be moved to
include/exec/memory-internal.h.

Compute is_ram and is_romd once within tlb_set_page_with_attrs,
reducing the number of tests against these predicates.  Compute
cpu_physical_memory_is_clean outside of the tlb lock region.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/exec-all.h |  6 +---
 accel/tcg/cputlb.c      | 68 ++++++++++++++++++++++++++---------------
 exec.c                  | 22 ++-----------
 3 files changed, 47 insertions(+), 49 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -XXX,XX +XXX,XX @@ address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
                                   hwaddr *xlat, hwaddr *plen,
                                   MemTxAttrs attrs, int *prot);
 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
-                                       MemoryRegionSection *section,
-                                       target_ulong vaddr,
-                                       hwaddr paddr, hwaddr xlat,
-                                       int prot,
-                                       target_ulong *address);
+                                       MemoryRegionSection *section);
 #endif
 
 /* vl.c */
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     MemoryRegionSection *section;
     unsigned int index;
     target_ulong address;
-    target_ulong code_address;
+    target_ulong write_address;
     uintptr_t addend;
     CPUTLBEntry *te, tn;
     hwaddr iotlb, xlat, sz, paddr_page;
     target_ulong vaddr_page;
     int asidx = cpu_asidx_from_attrs(cpu, attrs);
     int wp_flags;
+    bool is_ram, is_romd;
 
     assert_cpu_is_self(cpu);
 
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     if (attrs.byte_swap) {
         address |= TLB_BSWAP;
     }
-    if (!memory_region_is_ram(section->mr) &&
-        !memory_region_is_romd(section->mr)) {
-        /* IO memory case */
-        address |= TLB_MMIO;
-        addend = 0;
-    } else {
+
+    is_ram = memory_region_is_ram(section->mr);
+    is_romd = memory_region_is_romd(section->mr);
+
+    if (is_ram || is_romd) {
+        /* RAM and ROMD both have associated host memory. */
         addend = (uintptr_t)memory_region_get_ram_ptr(section->mr) + xlat;
+    } else {
+        /* I/O does not; force the host address to NULL. */
+        addend = 0;
+    }
+
+    write_address = address;
+    if (is_ram) {
+        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
+        /*
+         * Computing is_clean is expensive; avoid all that unless
+         * the page is actually writable.
+         */
+        if (prot & PAGE_WRITE) {
+            if (section->readonly) {
+                write_address |= TLB_DISCARD_WRITE;
+            } else if (cpu_physical_memory_is_clean(iotlb)) {
+                write_address |= TLB_NOTDIRTY;
+            }
+        }
+    } else {
+        /* I/O or ROMD */
+        iotlb = memory_region_section_get_iotlb(cpu, section) + xlat;
+        /*
+         * Writes to romd devices must go through MMIO to enable write.
+         * Reads to romd devices go through the ram_ptr found above,
+         * but of course reads to I/O must go through MMIO.
+         */
+        write_address |= TLB_MMIO;
+        if (!is_romd) {
+            address = write_address;
+        }
     }
 
-    code_address = address;
-    iotlb = memory_region_section_get_iotlb(cpu, section, vaddr_page,
-                                            paddr_page, xlat, prot, &address);
     wp_flags = cpu_watchpoint_address_matches(cpu, vaddr_page,
                                               TARGET_PAGE_SIZE);
 
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     /*
      * At this point iotlb contains a physical section number in the lower
      * TARGET_PAGE_BITS, and either
-     *  + the ram_addr_t of the page base of the target RAM (if NOTDIRTY or ROM)
-     *  + the offset within section->mr of the page base (otherwise)
+     *  + the ram_addr_t of the page base of the target RAM (RAM)
+     *  + the offset within section->mr of the page base (I/O, ROMD)
      * We subtract the vaddr_page (which is page aligned and thus won't
      * disturb the low bits) to give an offset which can be added to the
      * (non-page-aligned) vaddr of the eventual memory access to get
@@ -XXX,XX +XXX,XX @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     }
 
     if (prot & PAGE_EXEC) {
-        tn.addr_code = code_address;
+        tn.addr_code = address;
     } else {
         tn.addr_code = -1;
     }
 
     tn.addr_write = -1;
     if (prot & PAGE_WRITE) {
-        tn.addr_write = address;
-        if (memory_region_is_romd(section->mr)) {
-            /* Use the MMIO path so that the device can switch states. */
-            tn.addr_write |= TLB_MMIO;
-        } else if (memory_region_is_ram(section->mr)) {
-            if (section->readonly) {
-                tn.addr_write |= TLB_DISCARD_WRITE;
-            } else if (cpu_physical_memory_is_clean(
-                        memory_region_get_ram_addr(section->mr) + xlat)) {
-                tn.addr_write |= TLB_NOTDIRTY;
-            }
-        }
+        tn.addr_write = write_address;
         if (prot & PAGE_WRITE_INV) {
             tn.addr_write |= TLB_INVALID_MASK;
         }
diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
 
 /* Called from RCU critical section */
 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
-                                       MemoryRegionSection *section,
-                                       target_ulong vaddr,
-                                       hwaddr paddr, hwaddr xlat,
-                                       int prot,
-                                       target_ulong *address)
+                                       MemoryRegionSection *section)
 {
-    hwaddr iotlb;
-
-    if (memory_region_is_ram(section->mr)) {
-        /* Normal RAM.  */
-        iotlb = memory_region_get_ram_addr(section->mr) + xlat;
-    } else {
-        AddressSpaceDispatch *d;
-
-        d = flatview_to_dispatch(section->fv);
-        iotlb = section - d->map.sections;
-        iotlb += xlat;
-    }
-
-    return iotlb;
+    AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
+    return section - d->map.sections;
 }
 #endif /* defined(CONFIG_USER_ONLY) */
 
-- 
2.17.1

Since 9458a9a1df1a, all readers of the dirty bitmaps wait
for the rcu lock, which means that they wait until the end
of any executing TranslationBlock.

As a consequence, there is no need for the actual access
to happen in between the _prepare and _complete.  Therefore,
we can improve things by merging the two functions into
notdirty_write and dropping the NotDirtyInfo structure.

In addition, the only users of notdirty_write are in cputlb.c,
so move the merged function there.  Pass in the CPUIOTLBEntry
from which the ram_addr_t may be computed.

Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/memory-internal.h | 65 -----------------------------
 accel/tcg/cputlb.c             | 76 +++++++++++++++++++---------------
 exec.c                         | 44 --------------------
 3 files changed, 42 insertions(+), 143 deletions(-)

diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h
index XXXXXXX..XXXXXXX 100644
--- a/include/exec/memory-internal.h
+++ b/include/exec/memory-internal.h
@@ -XXX,XX +XXX,XX @@ void address_space_dispatch_free(AddressSpaceDispatch *d);
 
 void mtree_print_dispatch(struct AddressSpaceDispatch *d,
                           MemoryRegion *root);
-
-struct page_collection;
-
-/* Opaque struct for passing info from memory_notdirty_write_prepare()
- * to memory_notdirty_write_complete(). Callers should treat all fields
- * as private, with the exception of @active.
- *
- * @active is a field which is not touched by either the prepare or
- * complete functions, but which the caller can use if it wishes to
- * track whether it has called prepare for this struct and so needs
- * to later call the complete function.
- */
-typedef struct {
-    CPUState *cpu;
-    struct page_collection *pages;
-    ram_addr_t ram_addr;
-    vaddr mem_vaddr;
-    unsigned size;
-    bool active;
-} NotDirtyInfo;
-
-/**
- * memory_notdirty_write_prepare: call before writing to non-dirty memory
- * @ndi: pointer to opaque NotDirtyInfo struct
- * @cpu: CPU doing the write
- * @mem_vaddr: virtual address of write
- * @ram_addr: the ram address of the write
- * @size: size of write in bytes
- *
- * Any code which writes to the host memory corresponding to
- * guest RAM which has been marked as NOTDIRTY must wrap those
- * writes in calls to memory_notdirty_write_prepare() and
- * memory_notdirty_write_complete():
- *
- *  NotDirtyInfo ndi;
- *  memory_notdirty_write_prepare(&ndi, ....);
- *  ... perform write here ...
- *  memory_notdirty_write_complete(&ndi);
- *
- * These calls will ensure that we flush any TCG translated code for
- * the memory being written, update the dirty bits and (if possible)
- * remove the slowpath callback for writing to the memory.
- *
- * This must only be called if we are using TCG; it will assert otherwise.
- *
- * We may take locks in the prepare call, so callers must ensure that
- * they don't exit (via longjump or otherwise) without calling complete.
- *
- * This call must only be made inside an RCU critical section.
- * (Note that while we're executing a TCG TB we're always in an
- * RCU critical section, which is likely to be the case for callers
- * of these functions.)
- */
-void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
-                                   CPUState *cpu,
-                                   vaddr mem_vaddr,
-                                   ram_addr_t ram_addr,
-                                   unsigned size);
-/**
- * memory_notdirty_write_complete: finish write to non-dirty memory
- * @ndi: pointer to the opaque NotDirtyInfo struct which was initialized
- * by memory_not_dirty_write_prepare().
- */
-void memory_notdirty_write_complete(NotDirtyInfo *ndi);
-
 #endif
 #endif
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/helper-proto.h"
 #include "qemu/atomic.h"
 #include "qemu/atomic128.h"
+#include "translate-all.h"
 
 /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
 /* #define DEBUG_TLB */
@@ -XXX,XX +XXX,XX @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
     return qemu_ram_addr_from_host_nofail(p);
 }
 
+static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
+                           CPUIOTLBEntry *iotlbentry, uintptr_t retaddr)
+{
+    ram_addr_t ram_addr = mem_vaddr + iotlbentry->addr;
+
+    trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
+
+    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
+        struct page_collection *pages
+            = page_collection_lock(ram_addr, ram_addr + size);
+
+        /* We require mem_io_pc in tb_invalidate_phys_page_range.  */
+        cpu->mem_io_pc = retaddr;
+
+        tb_invalidate_phys_page_fast(pages, ram_addr, size);
+        page_collection_unlock(pages);
+    }
+
+    /*
+     * Set both VGA and migration bits for simplicity and to remove
+     * the notdirty callback faster.
+     */
+    cpu_physical_memory_set_dirty_range(ram_addr, size, DIRTY_CLIENTS_NOCODE);
+
+    /* We remove the notdirty callback only if the code has been flushed. */
+    if (!cpu_physical_memory_is_clean(ram_addr)) {
+        trace_memory_notdirty_set_dirty(mem_vaddr);
+        tlb_set_dirty(cpu, mem_vaddr);
+    }
+}
+
 /*
  * Probe for whether the specified guest access is permitted. If it is not
  * permitted then an exception will be taken in the same way as if this
@@ -XXX,XX +XXX,XX @@ void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
 /* Probe for a read-modify-write atomic operation.  Do not allow unaligned
  * operations, or io operations to proceed.  Return the host address.  */
 static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
-                               TCGMemOpIdx oi, uintptr_t retaddr,
-                               NotDirtyInfo *ndi)
+                               TCGMemOpIdx oi, uintptr_t retaddr)
 {
     size_t mmu_idx = get_mmuidx(oi);
     uintptr_t index = tlb_index(env, mmu_idx, addr);
@@ -XXX,XX +XXX,XX @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
 
     hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
 
-    ndi->active = false;
     if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
-        ndi->active = true;
-        memory_notdirty_write_prepare(ndi, env_cpu(env), addr,
-                                      qemu_ram_addr_from_host_nofail(hostaddr),
-                                      1 << s_bits);
+        notdirty_write(env_cpu(env), addr, 1 << s_bits,
+                       &env_tlb(env)->d[mmu_idx].iotlb[index], retaddr);
     }
 
     return hostaddr;
@@ -XXX,XX +XXX,XX @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
             return;
         }
 
-        haddr = (void *)((uintptr_t)addr + entry->addend);
-
         /* Handle clean RAM pages.  */
         if (tlb_addr & TLB_NOTDIRTY) {
-            NotDirtyInfo ndi;
-
-            /* We require mem_io_pc in tb_invalidate_phys_page_range.  */
-            env_cpu(env)->mem_io_pc = retaddr;
-
-            memory_notdirty_write_prepare(&ndi, env_cpu(env), addr,
-                                          addr + iotlbentry->addr, size);
-
-            if (unlikely(need_swap)) {
-                store_memop(haddr, val, op ^ MO_BSWAP);
-            } else {
-                store_memop(haddr, val, op);
-            }
-
-            memory_notdirty_write_complete(&ndi);
-            return;
+            notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
         }
 
+        haddr = (void *)((uintptr_t)addr + entry->addend);
+
         /*
          * Keep these two store_memop separate to ensure that the compiler
          * is able to fold the entire function to a single instruction.
@@ -XXX,XX +XXX,XX @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
 #define EXTRA_ARGS     , TCGMemOpIdx oi, uintptr_t retaddr
 #define ATOMIC_NAME(X) \
     HELPER(glue(glue(glue(atomic_ ## X, SUFFIX), END), _mmu))
-#define ATOMIC_MMU_DECLS NotDirtyInfo ndi
-#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr, &ndi)
-#define ATOMIC_MMU_CLEANUP                              \
-    do {                                                \
-        if (unlikely(ndi.active)) {                     \
-            memory_notdirty_write_complete(&ndi);       \
-        }                                               \
-    } while (0)
+#define ATOMIC_MMU_DECLS
+#define ATOMIC_MMU_LOOKUP atomic_mmu_lookup(env, addr, oi, retaddr)
+#define ATOMIC_MMU_CLEANUP
 
 #define DATA_SIZE 1
 #include "atomic_template.h"
@@ -XXX,XX +XXX,XX @@ void helper_be_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
 #undef ATOMIC_MMU_LOOKUP
 #define EXTRA_ARGS         , TCGMemOpIdx oi
 #define ATOMIC_NAME(X)     HELPER(glue(glue(atomic_ ## X, SUFFIX), END))
-#define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, oi, GETPC(), &ndi)
+#define ATOMIC_MMU_LOOKUP  atomic_mmu_lookup(env, addr, oi, GETPC())
 
 #define DATA_SIZE 1
 #include "atomic_template.h"
diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
     return block->offset + offset;
 }
 
-/* Called within RCU critical section. */
-void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
-                          CPUState *cpu,
-                          vaddr mem_vaddr,
-                          ram_addr_t ram_addr,
-                          unsigned size)
-{
-    ndi->cpu = cpu;
-    ndi->ram_addr = ram_addr;
-    ndi->mem_vaddr = mem_vaddr;
-    ndi->size = size;
-    ndi->pages = NULL;
-
-    trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
-
-    assert(tcg_enabled());
-    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
-        ndi->pages = page_collection_lock(ram_addr, ram_addr + size);
-        tb_invalidate_phys_page_fast(ndi->pages, ram_addr, size);
-    }
-}
-
-/* Called within RCU critical section. */
-void memory_notdirty_write_complete(NotDirtyInfo *ndi)
-{
-    if (ndi->pages) {
-        assert(tcg_enabled());
-        page_collection_unlock(ndi->pages);
-        ndi->pages = NULL;
-    }
-
-    /* Set both VGA and migration bits for simplicity and to remove
-     * the notdirty callback faster.
-     */
-    cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
-                                        DIRTY_CLIENTS_NOCODE);
-    /* we remove the notdirty callback only if the code has been
-       flushed */
-    if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
-        trace_memory_notdirty_set_dirty(ndi->mem_vaddr);
-        tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
-    }
-}
-
 /* Generate a debug exception if a watchpoint has been hit.  */
 void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
                           MemTxAttrs attrs, int flags, uintptr_t ra)
-- 
2.17.1

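For a clean RAM page (TLB_NOTDIRTY), probe_access no longer needs to
reject the access by returning NULL.  We can use notdirty_write for
the write and return a valid host pointer for this case.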

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
         return NULL;
     }
 
-    /* Handle watchpoints.  */
-    if (tlb_addr & TLB_WATCHPOINT) {
-        cpu_check_watchpoint(env_cpu(env), addr, size,
-                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
-                             wp_access, retaddr);
-    }
+    if (unlikely(tlb_addr & TLB_FLAGS_MASK)) {
+        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
 
-    /* Reject I/O access, or other required slow-path.  */
-    if (tlb_addr & (TLB_NOTDIRTY | TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
-        return NULL;
+        /* Reject I/O access, or other required slow-path.  */
+        if (tlb_addr & (TLB_MMIO | TLB_BSWAP | TLB_DISCARD_WRITE)) {
+            return NULL;
+        }
+
+        /* Handle watchpoints.  */
+        if (tlb_addr & TLB_WATCHPOINT) {
+            cpu_check_watchpoint(env_cpu(env), addr, size,
+                                 iotlbentry->attrs, wp_access, retaddr);
+        }
+
+        /* Handle clean RAM pages.  */
+        if (tlb_addr & TLB_NOTDIRTY) {
+            notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
+        }
     }
 
     return (void *)((uintptr_t)addr + entry->addend);
-- 
2.17.1

With the merge of notdirty handling into store_helper,
the last user of cpu->mem_io_vaddr was removed.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/core/cpu.h | 2 --
 accel/tcg/cputlb.c    | 2 --
 hw/core/cpu.c         | 1 -
 3 files changed, 5 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -XXX,XX +XXX,XX @@ struct qemu_work_item;
  * @next_cpu: Next CPU sharing TB cache.
  * @opaque: User data.
  * @mem_io_pc: Host Program Counter at which the memory was accessed.
- * @mem_io_vaddr: Target virtual address at which the memory was accessed.
  * @kvm_fd: vCPU file descriptor for KVM.
  * @work_mutex: Lock to prevent multiple access to queued_work_*.
  * @queued_work_first: First asynchronous work pending.
@@ -XXX,XX +XXX,XX @@ struct CPUState {
      * we store some rarely used information in the CPU context.
      */
     uintptr_t mem_io_pc;
-    vaddr mem_io_vaddr;
     /*
      * This is only needed for the legacy cpu_unassigned_access() hook;
      * when all targets using it have been converted to use
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
         cpu_io_recompile(cpu, retaddr);
     }
 
-    cpu->mem_io_vaddr = addr;
     cpu->mem_io_access_type = access_type;
 
     if (mr->global_locking && !qemu_mutex_iothread_locked()) {
@@ -XXX,XX +XXX,XX @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     if (!cpu->can_do_io) {
         cpu_io_recompile(cpu, retaddr);
     }
-    cpu->mem_io_vaddr = addr;
     cpu->mem_io_pc = retaddr;
 
     if (mr->global_locking && !qemu_mutex_iothread_locked()) {
diff --git a/hw/core/cpu.c b/hw/core/cpu.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/core/cpu.c
+++ b/hw/core/cpu.c
@@ -XXX,XX +XXX,XX @@ static void cpu_common_reset(CPUState *cpu)
     cpu->interrupt_request = 0;
     cpu->halted = 0;
     cpu->mem_io_pc = 0;
-    cpu->mem_io_vaddr = 0;
     cpu->icount_extra = 0;
     atomic_set(&cpu->icount_decr_ptr->u32, 0);
     cpu->can_do_io = 1;
-- 
2.17.1

All callers pass false to this argument.  Remove it and pass the
constant on to tb_invalidate_phys_page_range__locked.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/translate-all.h | 3 +--
 accel/tcg/translate-all.c | 6 ++----
 exec.c                    | 4 ++--
 3 files changed, 5 insertions(+), 8 deletions(-)

Rather than rely on cpu->mem_io_pc, pass retaddr down directly.

Within tb_invalidate_phys_page_range__locked, the is_cpu_write_access
parameter is non-zero exactly when retaddr would be non-zero, so that
is a simple replacement.

Recognize that current_tb_not_found is true only when mem_io_pc
(and now retaddr) is also non-zero, so remove a redundant test.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/translate-all.h |  3 ++-
 accel/tcg/cputlb.c        |  6 +-----
 accel/tcg/translate-all.c | 39 +++++++++++++++++++--------------------
 3 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.h
+++ b/accel/tcg/translate-all.h
@@ -XXX,XX +XXX,XX @@ struct page_collection *page_collection_lock(tb_page_addr_t start,
                                              tb_page_addr_t end);
 void page_collection_unlock(struct page_collection *set);
 void tb_invalidate_phys_page_fast(struct page_collection *pages,
-                                  tb_page_addr_t start, int len);
+                                  tb_page_addr_t start, int len,
+                                  uintptr_t retaddr);
 void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
 void tb_check_watchpoint(CPUState *cpu);
 
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -XXX,XX +XXX,XX @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
         struct page_collection *pages
             = page_collection_lock(ram_addr, ram_addr + size);
-
-        /* We require mem_io_pc in tb_invalidate_phys_page_range.  */
-        cpu->mem_io_pc = retaddr;
-
-        tb_invalidate_phys_page_fast(pages, ram_addr, size);
+        tb_invalidate_phys_page_fast(pages, ram_addr, size, retaddr);
         page_collection_unlock(pages);
     }
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static void
 tb_invalidate_phys_page_range__locked(struct page_collection *pages,
                                       PageDesc *p, tb_page_addr_t start,
                                       tb_page_addr_t end,
-                                      int is_cpu_write_access)
+                                      uintptr_t retaddr)
 {
     TranslationBlock *tb;
     tb_page_addr_t tb_start, tb_end;
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
 #ifdef TARGET_HAS_PRECISE_SMC
     CPUState *cpu = current_cpu;
     CPUArchState *env = NULL;
-    int current_tb_not_found = is_cpu_write_access;
+    bool current_tb_not_found = retaddr != 0;
+    bool current_tb_modified = false;
     TranslationBlock *current_tb = NULL;
-    int current_tb_modified = 0;
     target_ulong current_pc = 0;
     target_ulong current_cs_base = 0;
     uint32_t current_flags = 0;
@@ -XXX,XX +XXX,XX @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
         if (!(tb_end <= start || tb_start >= end)) {
 #ifdef TARGET_HAS_PRECISE_SMC
             if (current_tb_not_found) {
-                current_tb_not_found = 0;
-                current_tb = NULL;
-                if (cpu->mem_io_pc) {
-                    /* now we have a real cpu fault */
-                    current_tb = tcg_tb_lookup(cpu->mem_io_pc);
-                }
+                current_tb_not_found = false;
+                /* now we have a real cpu fault */
+                current_tb = tcg_tb_lookup(retaddr);
             }
             if (current_tb == tb &&
                 (tb_cflags(current_tb) & CF_COUNT_MASK) != 1) {
-                /* If we are modifying the current TB, we must stop
-                its execution. We could be more precise by checking
-                that the modification is after the current PC, but it
-                would require a specialized function to partially
-                restore the CPU state */
-
-                current_tb_modified = 1;
-                cpu_restore_state_from_tb(cpu, current_tb,
-                                          cpu->mem_io_pc, true);
+                /*
+                 * If we are modifying the current TB, we must stop
+                 * its execution. We could be more precise by checking
+                 * that the modification is after the current PC, but it
+                 * would require a specialized function to partially
+                 * restore the CPU state.
+                 */
+                current_tb_modified = true;
+                cpu_restore_state_from_tb(cpu, current_tb, retaddr, true);
                 cpu_get_tb_cpu_state(env, &current_pc, &current_cs_base,
                                      &current_flags);
             }
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_range(target_ulong start, target_ulong end)
  * Call with all @pages in the range [@start, @start + len[ locked.
  */
 void tb_invalidate_phys_page_fast(struct page_collection *pages,
-                                  tb_page_addr_t start, int len)
+                                  tb_page_addr_t start, int len,
+                                  uintptr_t retaddr)
 {
     PageDesc *p;
 
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
         }
     } else {
     do_invalidate:
-        tb_invalidate_phys_page_range__locked(pages, p, start, start + len, 1);
+        tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
+                                              retaddr);
     }
 }
 #else
-- 
2.17.1

Fixes the previous TLB_WATCHPOINT patches, which fail to set
cpu->mem_io_pc for the call to cpu_check_watchpoint.
Pass down the retaddr directly because it's readily available.

Fixes: 50b107c5d61
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/translate-all.h | 2 +-
 accel/tcg/translate-all.c | 6 +++---
 exec.c                    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/accel/tcg/translate-all.h b/accel/tcg/translate-all.h
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.h
+++ b/accel/tcg/translate-all.h
@@ -XXX,XX +XXX,XX @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
                                   tb_page_addr_t start, int len,
                                   uintptr_t retaddr);
 void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end);
-void tb_check_watchpoint(CPUState *cpu);
+void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr);
 
 #ifdef CONFIG_USER_ONLY
 int page_unprotect(target_ulong address, uintptr_t pc);
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index XXXXXXX..XXXXXXX 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -XXX,XX +XXX,XX @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
 #endif
 
 /* user-mode: call with mmap_lock held */
-void tb_check_watchpoint(CPUState *cpu)
+void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
 {
     TranslationBlock *tb;
 
     assert_memory_lock();
 
-    tb = tcg_tb_lookup(cpu->mem_io_pc);
+    tb = tcg_tb_lookup(retaddr);
     if (tb) {
         /* We can use retranslation to find the PC.  */
-        cpu_restore_state_from_tb(cpu, tb, cpu->mem_io_pc, true);
+        cpu_restore_state_from_tb(cpu, tb, retaddr, true);
         tb_phys_invalidate(tb, -1);
     } else {
         /* The exception probably happened in a helper.  The CPU state should
diff --git a/exec.c b/exec.c
index XXXXXXX..XXXXXXX 100644
--- a/exec.c
+++ b/exec.c
@@ -XXX,XX +XXX,XX @@ void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
                 cpu->watchpoint_hit = wp;
 
                 mmap_lock();
-                tb_check_watchpoint(cpu);
+                tb_check_watchpoint(cpu, ra);
                 if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                     cpu->exception_index = EXCP_DEBUG;
                     mmap_unlock();
-- 
2.17.1

The following changes since commit c52d69e7dbaaed0ffdef8125e79218672c30161d:

Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20211027' into staging (2021-10-27 11:45:18 -0700)

are available in the Git repository at:

https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20211027

for you to fetch changes up to 820c025f0dcacf2f3c12735b1f162893fbfa7bc6:

tcg/optimize: Propagate sign info for shifting (2021-10-27 17:11:23 -0700)

----------------------------------------------------------------
Improvements to qemu/int128
Fixes for 128/64 division.
Cleanup tcg/optimize.c
Optimize redundant sign extensions

----------------------------------------------------------------
Frédéric Pétrot (1):
      qemu/int128: Add int128_{not,xor}

Luis Pires (4):
      host-utils: move checks out of divu128/divs128
      host-utils: move udiv_qrnnd() to host-utils
      host-utils: add 128-bit quotient support to divu128/divs128
      host-utils: add unit tests for divu128/divs128

Richard Henderson (51):
      tcg/optimize: Rename "mask" to "z_mask"
      tcg/optimize: Split out OptContext
      tcg/optimize: Remove do_default label
      tcg/optimize: Change tcg_opt_gen_{mov,movi} interface
      tcg/optimize: Move prev_mb into OptContext
      tcg/optimize: Split out init_arguments
      tcg/optimize: Split out copy_propagate
      tcg/optimize: Split out fold_call
      tcg/optimize: Drop nb_oargs, nb_iargs locals
      tcg/optimize: Change fail return for do_constant_folding_cond*
      tcg/optimize: Return true from tcg_opt_gen_{mov,movi}
      tcg/optimize: Split out finish_folding
      tcg/optimize: Use a boolean to avoid a mass of continues
      tcg/optimize: Split out fold_mb, fold_qemu_{ld,st}
      tcg/optimize: Split out fold_const{1,2}
      tcg/optimize: Split out fold_setcond2
      tcg/optimize: Split out fold_brcond2
      tcg/optimize: Split out fold_brcond
      tcg/optimize: Split out fold_setcond
      tcg/optimize: Split out fold_mulu2_i32
      tcg/optimize: Split out fold_addsub2_i32
      tcg/optimize: Split out fold_movcond
      tcg/optimize: Split out fold_extract2
      tcg/optimize: Split out fold_extract, fold_sextract
      tcg/optimize: Split out fold_deposit
      tcg/optimize: Split out fold_count_zeros
      tcg/optimize: Split out fold_bswap
      tcg/optimize: Split out fold_dup, fold_dup2
      tcg/optimize: Split out fold_mov
      tcg/optimize: Split out fold_xx_to_i
      tcg/optimize: Split out fold_xx_to_x
      tcg/optimize: Split out fold_xi_to_i
      tcg/optimize: Add type to OptContext
      tcg/optimize: Split out fold_to_not
      tcg/optimize: Split out fold_sub_to_neg
      tcg/optimize: Split out fold_xi_to_x
      tcg/optimize: Split out fold_ix_to_i
      tcg/optimize: Split out fold_masks
      tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies
      tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops
      tcg/optimize: Sink commutative operand swapping into fold functions
      tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values
      tcg/optimize: Use fold_xx_to_i for orc
      tcg/optimize: Use fold_xi_to_x for mul
      tcg/optimize: Use fold_xi_to_x for div
      tcg/optimize: Use fold_xx_to_i for rem
      tcg/optimize: Optimize sign extensions
      tcg/optimize: Propagate sign info for logical operations
      tcg/optimize: Propagate sign info for setcond
      tcg/optimize: Propagate sign info for bit counting
      tcg/optimize: Propagate sign info for shifting

From: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>

Addition of not and xor on 128-bit integers.

Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
Message-Id: <20211025122818.168890-3-frederic.petrot@univ-grenoble-alpes.fr>
[rth: Split out logical operations.]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/int128.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/qemu/int128.h b/include/qemu/int128.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/int128.h
+++ b/include/qemu/int128.h
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return a;
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return ~a;
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return a & b;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return a | b;
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return a ^ b;
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     return a >> n;
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_exts64(int64_t a)
     return int128_make128(a, (a < 0) ? -1 : 0);
 }
 
+static inline Int128 int128_not(Int128 a)
+{
+    return int128_make128(~a.lo, ~a.hi);
+}
+
 static inline Int128 int128_and(Int128 a, Int128 b)
 {
     return int128_make128(a.lo & b.lo, a.hi & b.hi);
@@ -XXX,XX +XXX,XX @@ static inline Int128 int128_or(Int128 a, Int128 b)
     return int128_make128(a.lo | b.lo, a.hi | b.hi);
 }
 
+static inline Int128 int128_xor(Int128 a, Int128 b)
+{
+    return int128_make128(a.lo ^ b.lo, a.hi ^ b.hi);
+}
+
 static inline Int128 int128_rshift(Int128 a, int n)
 {
     int64_t h;
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

In preparation for changing the divu128/divs128 implementations
to allow for quotients larger than 64 bits, move the div-by-zero
and overflow checks to the callers.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-2-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |  5 +++--
 include/qemu/host-utils.h | 34 ++++++++++++---------------------
 target/ppc/int_helper.c   | 14 +++++++++-----
 util/host-utils.c         | 40 ++++++++++++++++++---------------------
 4 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
         return 0;
     }
     /*
-     * Ignore divu128() return value as we've caught div-by-zero and don't
-     * need different behaviour for overflow.
+     * BUG: when CONFIG_INT128 is not defined, the current implementation of
+     * divu128 does not return a valid truncated quotient, so the result will
+     * be wrong.
      */
     divu128(&lo, &hi, clk->period);
     return lo;
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
-        __uint128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result > UINT64_MAX;
-    }
+    __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
+    __uint128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 
-static inline int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    if (divisor == 0) {
-        return 1;
-    } else {
-        __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
-        __int128_t result = dividend / divisor;
-        *plow = result;
-        *phigh = dividend % divisor;
-        return result != *plow;
-    }
+    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t result = dividend / divisor;
+    *plow = result;
+    *phigh = dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
     uint64_t rt = 0;
     int overflow = 0;
 
-    overflow = divu128(&rt, &ra, rb);
-
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || ra >= rb)) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divu128(&rt, &ra, rb);
     }
 
     if (oe) {
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
     int64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
-    int overflow = divs128(&rt, &ra, rb);
+    int overflow = 0;
 
-    if (unlikely(overflow)) {
+    if (unlikely(rb == 0 || uabs64(ra) >= uabs64(rb))) {
+        overflow = 1;
         rt = 0; /* Undefined */
+    } else {
+        divs128(&rt, &ra, rb);
     }
 
     if (oe) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
     *phigh = rh;
 }
 
-/* Unsigned 128x64 division.  Returns 1 if overflow (divide by zero or */
-/* quotient exceeds 64 bits).  Otherwise returns quotient via plow and */
-/* remainder via phigh. */
-int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+/*
+ * Unsigned 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
     unsigned i;
     uint64_t carry = 0;
 
-    if (divisor == 0) {
-        return 1;
-    } else if (dhi == 0) {
+    if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
         *phigh = dlo % divisor;
-        return 0;
-    } else if (dhi >= divisor) {
-        return 1;
     } else {
 
         for (i = 0; i < 64; i++) {
@@ -XXX,XX +XXX,XX @@ int divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 
         *plow = dlo;
         *phigh = dhi;
-        return 0;
     }
 }
 
-int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+/*
+ * Signed 128-by-64 division. Returns quotient via plow and
+ * remainder via phigh.
+ * The result must fit in 64 bits (plow) - otherwise, the result
+ * is undefined.
+ * This function will cause a division by zero if passed a zero divisor.
+ */
+void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
 {
     int sgn_dvdnd = *phigh < 0;
     int sgn_divsr = divisor < 0;
-    int overflow = 0;
 
     if (sgn_dvdnd) {
         *plow = ~(*plow);
@@ -XXX,XX +XXX,XX @@ int divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
         divisor = 0 - divisor;
     }
 
-    overflow = divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
+    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
 
     if (sgn_dvdnd  ^ sgn_divsr) {
         *plow = 0 - *plow;
     }
-
-    if (!overflow) {
-        if ((*plow < 0) ^ (sgn_dvdnd ^ sgn_divsr)) {
-            overflow = 1;
-        }
-    }
-
-    return overflow;
 }
 #endif
 
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Move udiv_qrnnd() from include/fpu/softfloat-macros.h to host-utils,
so it can be reused by divu128().

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-3-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat-macros.h | 82 ----------------------------------
 include/qemu/host-utils.h      | 81 +++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/include/fpu/softfloat-macros.h b/include/fpu/softfloat-macros.h
index XXXXXXX..XXXXXXX 100644
--- a/include/fpu/softfloat-macros.h
+++ b/include/fpu/softfloat-macros.h
@@ -XXX,XX +XXX,XX @@
  * so some portions are provided under:
  *  the SoftFloat-2a license
  *  the BSD license
- *  GPL-v2-or-later
  *
  * Any future contributions to this file after December 1st 2014 will be
  * taken to be licensed under the Softfloat-2a license unless specifically
@@ -XXX,XX +XXX,XX @@ this code that are retained.
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* Portions of this work are licensed under the terms of the GNU GPL,
- * version 2 or later. See the COPYING file in the top-level directory.
- */
-
 #ifndef FPU_SOFTFLOAT_MACROS_H
 #define FPU_SOFTFLOAT_MACROS_H
 
@@ -XXX,XX +XXX,XX @@ static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 
 }
 
-/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
- * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
- *
- * Licensed under the GPLv2/LGPLv3
- */
-static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
-                                  uint64_t n0, uint64_t d)
-{
-#if defined(__x86_64__)
-    uint64_t q;
-    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
-    return q;
-#elif defined(__s390x__) && !defined(__clang__)
-    /* Need to use a TImode type to get an even register pair for DLGR.  */
-    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
-    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
-    *r = n >> 64;
-    return n;
-#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
-    /* From Power ISA 2.06, programming note for divdeu.  */
-    uint64_t q1, q2, Q, r1, r2, R;
-    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
-        : "=&r"(q1), "=r"(q2)
-        : "r"(n1), "r"(n0), "r"(d));
-    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
-    r2 = n0 - (q2 * d);
-    Q = q1 + q2;
-    R = r1 + r2;
-    if (R >= d || R < r2) { /* overflow implies R > d */
-        Q += 1;
-        R -= d;
-    }
-    *r = R;
-    return Q;
-#else
-    uint64_t d0, d1, q0, q1, r1, r0, m;
-
-    d0 = (uint32_t)d;
-    d1 = d >> 32;
-
-    r1 = n1 % d1;
-    q1 = n1 / d1;
-    m = q1 * d0;
-    r1 = (r1 << 32) | (n0 >> 32);
-    if (r1 < m) {
-        q1 -= 1;
-        r1 += d;
-        if (r1 >= d) {
-            if (r1 < m) {
-                q1 -= 1;
-                r1 += d;
-            }
-        }
-    }
-    r1 -= m;
-
-    r0 = r1 % d1;
-    q0 = r1 / d1;
-    m = q0 * d0;
-    r0 = (r0 << 32) | (uint32_t)n0;
-    if (r0 < m) {
-        q0 -= 1;
-        r0 += d;
-        if (r0 >= d) {
-            if (r0 < m) {
-                q0 -= 1;
-                r0 += d;
-            }
-        }
-    }
-    r0 -= m;
-
-    *r = r0;
-    return (q1 << 32) | q0;
-#endif
-}
-
 /*----------------------------------------------------------------------------
 | Returns an approximation to the square root of the 32-bit significand given
 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@
  * THE SOFTWARE.
  */
 
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
+
 #ifndef HOST_UTILS_H
 #define HOST_UTILS_H
 
@@ -XXX,XX +XXX,XX @@ void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift);
  */
 void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow);
 
+/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
+                                  uint64_t n0, uint64_t d)
+{
+#if defined(__x86_64__)
+    uint64_t q;
+    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
+    return q;
+#elif defined(__s390x__) && !defined(__clang__)
+    /* Need to use a TImode type to get an even register pair for DLGR.  */
+    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
+    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
+    *r = n >> 64;
+    return n;
+#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
+    /* From Power ISA 2.06, programming note for divdeu.  */
+    uint64_t q1, q2, Q, r1, r2, R;
+    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
+        : "=&r"(q1), "=r"(q2)
+        : "r"(n1), "r"(n0), "r"(d));
+    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
+    r2 = n0 - (q2 * d);
+    Q = q1 + q2;
+    R = r1 + r2;
+    if (R >= d || R < r2) { /* overflow implies R > d */
+        Q += 1;
+        R -= d;
+    }
+    *r = R;
+    return Q;
+#else
+    uint64_t d0, d1, q0, q1, r1, r0, m;
+
+    d0 = (uint32_t)d;
+    d1 = d >> 32;
+
+    r1 = n1 % d1;
+    q1 = n1 / d1;
+    m = q1 * d0;
+    r1 = (r1 << 32) | (n0 >> 32);
+    if (r1 < m) {
+        q1 -= 1;
+        r1 += d;
+        if (r1 >= d) {
+            if (r1 < m) {
+                q1 -= 1;
+                r1 += d;
+            }
+        }
+    }
+    r1 -= m;
+
+    r0 = r1 % d1;
+    q0 = r1 / d1;
+    m = q0 * d0;
+    r0 = (r0 << 32) | (uint32_t)n0;
+    if (r0 < m) {
+        q0 -= 1;
+        r0 += d;
+        if (r0 >= d) {
+            if (r0 < m) {
+                q0 -= 1;
+                r0 += d;
+            }
+        }
+    }
+    r0 -= m;
+
+    *r = r0;
+    return (q1 << 32) | q0;
+#endif
+}
+
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Add 128-bit quotient support to divu128/divs128. This will be used
to implement new decimal floating point instructions from Power ISA 3.1.

The remainder is now returned directly by divu128/divs128,
freeing up phigh to receive the high 64 bits of the quotient.

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-4-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/hw/clock.h        |   6 +-
 include/qemu/host-utils.h |  20 ++++--
 target/ppc/int_helper.c   |   9 +--
 util/host-utils.c         | 133 +++++++++++++++++++++++++-------------
 4 files changed, 108 insertions(+), 60 deletions(-)

diff --git a/include/hw/clock.h b/include/hw/clock.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t clock_ns_to_ticks(const Clock *clk, uint64_t ns)
     if (clk->period == 0) {
         return 0;
     }
-    /*
-     * BUG: when CONFIG_INT128 is not defined, the current implementation of
-     * divu128 does not return a valid truncated quotient, so the result will
-     * be wrong.
-     */
+
     divu128(&lo, &hi, clk->period);
     return lo;
 }
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -XXX,XX +XXX,XX @@ static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
     return (__int128_t)a * b / c;
 }
 
-static inline void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh,
+                               uint64_t divisor)
 {
     __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow;
     __uint128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 
-static inline void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+static inline int64_t divs128(uint64_t *plow, int64_t *phigh,
+                              int64_t divisor)
 {
-    __int128_t dividend = ((__int128_t)*phigh << 64) | (uint64_t)*plow;
+    __int128_t dividend = ((__int128_t)*phigh << 64) | *plow;
     __int128_t result = dividend / divisor;
+
     *plow = result;
-    *phigh = dividend % divisor;
+    *phigh = result >> 64;
+    return dividend % divisor;
 }
 #else
 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b);
 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b);
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor);
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor);
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor);
 
 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ uint64_t helper_divdeu(CPUPPCState *env, uint64_t ra, uint64_t rb, uint32_t oe)
 
 uint64_t helper_divde(CPUPPCState *env, uint64_t rau, uint64_t rbu, uint32_t oe)
 {
-    int64_t rt = 0;
+    uint64_t rt = 0;
     int64_t ra = (int64_t)rau;
     int64_t rb = (int64_t)rbu;
     int overflow = 0;
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
     int cr;
     uint64_t lo_value;
     uint64_t hi_value;
+    uint64_t rem;
     ppc_avr_t ret = { .u64 = { 0, 0 } };
 
     if (b->VsrSD(0) < 0) {
@@ -XXX,XX +XXX,XX @@ uint32_t helper_bcdcfsq(ppc_avr_t *r, ppc_avr_t *b, uint32_t ps)
          * In that case, we leave r unchanged.
          */
     } else {
-        divu128(&lo_value, &hi_value, 1000000000000000ULL);
+        rem = divu128(&lo_value, &hi_value, 1000000000000000ULL);
 
-        for (i = 1; i < 16; hi_value /= 10, i++) {
-            bcd_put_digit(&ret, hi_value % 10, i);
+        for (i = 1; i < 16; rem /= 10, i++) {
+            bcd_put_digit(&ret, rem % 10, i);
         }
 
         for (; i < 32; lo_value /= 10, i++) {
diff --git a/util/host-utils.c b/util/host-utils.c
index XXXXXXX..XXXXXXX 100644
--- a/util/host-utils.c
+++ b/util/host-utils.c
@@ -XXX,XX +XXX,XX @@ void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
 }
 
 /*
- * Unsigned 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Unsigned 128-by-64 division.
+ * Returns the 128-bit quotient via plow (low 64 bits) and
+ * phigh (high 64 bits).
+ * Also returns the remainder via the function return value.
  */
-void divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
 {
     uint64_t dhi = *phigh;
     uint64_t dlo = *plow;
-    unsigned i;
-    uint64_t carry = 0;
+    uint64_t rem, dhighest;
+    int sh;
 
     if (divisor == 0 || dhi == 0) {
         *plow  = dlo / divisor;
-        *phigh = dlo % divisor;
+        *phigh = 0;
+        return dlo % divisor;
     } else {
+        sh = clz64(divisor);
 
-        for (i = 0; i < 64; i++) {
-            carry = dhi >> 63;
-            dhi = (dhi << 1) | (dlo >> 63);
-            if (carry || (dhi >= divisor)) {
-                dhi -= divisor;
-                carry = 1;
-            } else {
-                carry = 0;
+        if (dhi < divisor) {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
             }
-            dlo = (dlo << 1) | carry;
+
+            *phigh = 0;
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+        } else {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhighest = dhi >> (64 - sh);
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
+
+                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+            } else {
+                /**
+                 * dhi >= divisor
+                 * Since the MSB of divisor is set (sh == 0),
+                 * (dhi - divisor) < divisor
+                 *
+                 * Thus, the high part of the quotient is 1, and we can
+                 * calculate the low part with a single call to udiv_qrnnd
+                 * after subtracting divisor from dhi
+                 */
+                dhi -= divisor;
+                *phigh = 1;
+            }
+
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
         }
 
-        *plow = dlo;
-        *phigh = dhi;
+        /*
+         * since the dividend/divisor might have been normalized,
+         * the remainder might also have to be shifted back
+         */
+        return rem >> sh;
     }
 }
 
 /*
- * Signed 128-by-64 division. Returns quotient via plow and
- * remainder via phigh.
- * The result must fit in 64 bits (plow) - otherwise, the result
- * is undefined.
- * This function will cause a division by zero if passed a zero divisor.
+ * Signed 128-by-64 division.
+ * Returns quotient via plow and phigh.
+ * Also returns the remainder via the function return value.
  */
-void divs128(int64_t *plow, int64_t *phigh, int64_t divisor)
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
 {
-    int sgn_dvdnd = *phigh < 0;
-    int sgn_divsr = divisor < 0;
+    bool neg_quotient = false, neg_remainder = false;
+    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+    uint64_t rem;
 
-    if (sgn_dvdnd) {
-        *plow = ~(*plow);
-        *phigh = ~(*phigh);
-        if (*plow == (int64_t)-1) {
+    if (*phigh < 0) {
+        neg_quotient = !neg_quotient;
+        neg_remainder = !neg_remainder;
+
+        if (unsig_lo == 0) {
+            unsig_hi = -unsig_hi;
+        } else {
+            unsig_hi = ~unsig_hi;
+            unsig_lo = -unsig_lo;
+        }
+    }
+
+    if (divisor < 0) {
+        neg_quotient = !neg_quotient;
+
+        divisor = -divisor;
+    }
+
+    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+    if (neg_quotient) {
+        if (unsig_lo == 0) {
+            *phigh = -unsig_hi;
             *plow = 0;
-            (*phigh)++;
-         } else {
-            (*plow)++;
-         }
+        } else {
+            *phigh = ~unsig_hi;
+            *plow = -unsig_lo;
+        }
+    } else {
+        *phigh = unsig_hi;
+        *plow = unsig_lo;
     }
 
-    if (sgn_divsr) {
-        divisor = 0 - divisor;
-    }
-
-    divu128((uint64_t *)plow, (uint64_t *)phigh, (uint64_t)divisor);
-
-    if (sgn_dvdnd  ^ sgn_divsr) {
-        *plow = 0 - *plow;
+    if (neg_remainder) {
+        return -rem;
+    } else {
+        return rem;
     }
 }
 #endif
-- 
2.25.1

From: Luis Pires <luis.pires@eldorado.org.br>

Signed-off-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20211025191154.350831-5-luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/unit/test-div128.c | 197 +++++++++++++++++++++++++++++++++++++++
 tests/unit/meson.build   |   1 +
 2 files changed, 198 insertions(+)
 create mode 100644 tests/unit/test-div128.c

diff --git a/tests/unit/test-div128.c b/tests/unit/test-div128.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/tests/unit/test-div128.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Test 128-bit division functions
+ *
+ * Copyright (c) 2021 Instituto de Pesquisas Eldorado (eldorado.org.br)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+typedef struct { /* One divu128() test vector */
+    uint64_t high;      /* dividend, high half; overwritten by the call */
+    uint64_t low;       /* dividend, low half; overwritten by the call */
+    uint64_t rhigh;     /* expected value of high after the call */
+    uint64_t rlow;      /* expected value of low after the call */
+    uint64_t divisor;   /* divisor passed to divu128() */
+    uint64_t remainder; /* expected return value of divu128() */
+} test_data_unsigned;
+
+typedef struct { /* One divs128() test vector */
+    int64_t high;       /* dividend, high (signed) half; overwritten by the call */
+    uint64_t low;       /* dividend, low half; overwritten by the call */
+    int64_t rhigh;      /* expected value of high after the call */
+    uint64_t rlow;      /* expected value of low after the call */
+    int64_t divisor;    /* divisor passed to divs128() */
+    int64_t remainder;  /* expected return value of divs128() */
+} test_data_signed;
+
+static const test_data_unsigned test_table_unsigned[] = {
+    /*
+     * Each entry is laid out as
+     *   { high, low, rhigh, rlow, divisor, remainder }
+     * i.e. the dividend halves, the expected result halves, the divisor
+     * and the expected remainder.
+     *
+     * Dividend fits in 64 bits
+     */
+    { 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0x0000000000000003ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x0000000000000002ULL, 0x0000000000000001ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x0000000000000000ULL, 0xa000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000002ULL,
+      0x4000000000000000ULL, 0x2000000000000000ULL},
+    { 0x0000000000000000ULL, 0x8000000000000000ULL,
+      0x0000000000000000ULL, 0x0000000000000001ULL,
+      0x8000000000000000ULL, 0x0000000000000000ULL},
+
+    /* Dividend > 64 bits, with MSB 0 */
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0000000000000001ULL, 0x000000000000000dULL,
+      0x123456789abcdefeULL, 0x03456789abcdf03bULL},
+    { 0x123456789abcdefeULL, 0xefedcba987654321ULL,
+      0x0123456789abcdefULL, 0xeefedcba98765432ULL,
+      0x0000000000000010ULL, 0x0000000000000001ULL},
+
+    /* Dividend > 64 bits, with MSB 1 */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0feeddccbbaa9988ULL, 0x7766554433221100ULL,
+      0x0000000000000010ULL, 0x000000000000000fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x000000000000000eULL, 0x00f0f0f0f0f0f35aULL,
+      0x123456789abcdefeULL, 0x0f8922bc55ef90c3ULL},
+
+    /*
+     * Divisor == 64 bits, with MSB 1
+     * and high 64 bits of dividend >= divisor
+     * (for testing normalization)
+     */
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0x0000000000000000ULL,
+      0xfeeddccbbaa99887ULL, 0x766554433221100fULL},
+    { 0xfeeddccbbaa99887ULL, 0x766554433221100fULL,
+      0x0000000000000001ULL, 0xfddbb9977553310aULL,
+      0x8000000000000001ULL, 0x78899aabbccddf05ULL},
+
+    /* Dividend > 64 bits, divisor almost as big */
+    { 0x0000000000000001ULL, 0x23456789abcdef01ULL,
+      0x0000000000000000ULL, 0x000000000000000fULL,
+      0x123456789abcdefeULL, 0x123456789abcde1fULL},
+};
+
+static const test_data_signed test_table_signed[] = {
+    /*
+     * Each entry is laid out as
+     *   { high, low, rhigh, rlow, divisor, remainder }
+     * i.e. the dividend halves, the expected result halves, the divisor
+     * and the expected remainder.
+     *
+     * Positive dividend, positive/negative divisors
+     */
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0x0000000000000008LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0xfffffffffffffff8LL, 0x0000000000000006LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0x0000000000000237LL, 0x0000000000000183LL},
+    { 0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0xfffffffffffffdc9LL, 0x0000000000000183LL},
+
+    /* Negative dividend, positive/negative divisors */
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000001LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000bc614eULL,
+      0xffffffffffffffffLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffa1cf59ULL,
+      0x0000000000000002LL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x00000000005e30a7ULL,
+      0xfffffffffffffffeLL, 0x0000000000000000LL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffe873d7ULL,
+      0x0000000000000008LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x0000000000178c29ULL,
+      0xfffffffffffffff8LL, 0xfffffffffffffffaLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0xffffffffffffffffLL, 0xffffffffffffaaf3ULL,
+      0x0000000000000237LL, 0xfffffffffffffe7dLL},
+    { 0xffffffffffffffffLL, 0xffffffffff439eb2ULL,
+      0x0000000000000000LL, 0x000000000000550dULL,
+      0xfffffffffffffdc9LL, 0xfffffffffffffe7dLL},
+};
+
+/*
+ * Feed every test_table_unsigned entry to divu128() and check both halves
+ * of the value left in the dividend slots, plus the returned remainder,
+ * against the expectations stored in the entry.
+ */
+static void test_divu128(void)
+{
+    int idx;
+
+    for (idx = 0; idx < ARRAY_SIZE(test_table_unsigned); idx++) {
+        /* The table is const; divu128() needs writable dividend halves. */
+        test_data_unsigned vec = test_table_unsigned[idx];
+        uint64_t got_rem = divu128(&vec.low, &vec.high, vec.divisor);
+
+        g_assert_cmpuint(vec.low, ==, vec.rlow);
+        g_assert_cmpuint(vec.high, ==, vec.rhigh);
+        g_assert_cmpuint(got_rem, ==, vec.remainder);
+    }
+}
+
+/*
+ * Run divs128() on every test_table_signed entry and check the value left
+ * in the dividend slots and the returned remainder against the expected
+ * values stored in the entry.
+ */
+static void test_divs128(void)
+{
+    int i;
+    int64_t rem;
+    test_data_signed tmp;
+
+    for (i = 0; i < ARRAY_SIZE(test_table_signed); ++i) {
+        /* Work on a copy: the table is const and the call writes in place. */
+        tmp = test_table_signed[i];
+
+        rem = divs128(&tmp.low, &tmp.high, tmp.divisor);
+        g_assert_cmpuint(tmp.low, ==, tmp.rlow);
+        /*
+         * The high half and the remainder are signed quantities; use the
+         * signed comparison so a failure is reported with the right sign.
+         */
+        g_assert_cmpint(tmp.high, ==, tmp.rhigh);
+        g_assert_cmpint(rem, ==, tmp.remainder);
+    }
+}
+
+/*
+ * Test driver: initialise the GLib test harness, register the two
+ * division tests and return whatever g_test_run() reports.
+ */
+int main(int argc, char **argv)
+{
+    int status;
+
+    g_test_init(&argc, &argv, NULL);
+
+    g_test_add_func("/host-utils/test_divu128", test_divu128);
+    g_test_add_func("/host-utils/test_divs128", test_divs128);
+
+    status = g_test_run();
+    return status;
+}
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -XXX,XX +XXX,XX @@ tests = {
   # all code tested by test-x86-cpuid is inside topology.h
   'test-x86-cpuid': [],
   'test-cutils': [],
+  'test-div128': [],
   'test-shift128': [],
   'test-mul64': [],
   # all code tested by test-int128 is inside int128.h
-- 
2.25.1

Prepare for tracking different masks by renaming this one.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 +++++++++++++++++++++++++------------------------
 1 file changed, 72 insertions(+), 70 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *prev_copy;
     TCGTemp *next_copy;
     uint64_t val;
-    uint64_t mask;
+    uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->next_copy = ts;
     ti->prev_copy = ts;
     ti->is_const = false;
-    ti->mask = -1;
+    ti->z_mask = -1;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     if (ts->kind == TEMP_CONST) {
         ti->is_const = true;
         ti->val = ts->val;
-        ti->mask = ts->val;
+        ti->z_mask = ts->val;
         if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
             /* High bits of a 32-bit quantity are garbage.  */
-            ti->mask |= ~0xffffffffull;
+            ti->z_mask |= ~0xffffffffull;
         }
     } else {
         ti->is_const = false;
-        ti->mask = -1;
+        ti->z_mask = -1;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t mask;
+    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    mask = si->mask;
+    z_mask = si->z_mask;
     if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
         /* High bits of the destination are now garbage.  */
-        mask |= ~0xffffffffull;
+        z_mask |= ~0xffffffffull;
     }
-    di->mask = mask;
+    di->z_mask = z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t mask, partmask, affected, tmp;
+        uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
-        mask = -1;
+        z_mask = -1;
         affected = -1;
         switch (opc) {
         CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->mask & 0x80) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext8u):
-            mask = 0xff;
+            z_mask = 0xff;
             goto and_const;
         CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->mask & 0x8000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         CASE_OP_32_64(ext16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             goto and_const;
         case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_ext32u_i64:
-            mask = 0xffffffffU;
+            z_mask = 0xffffffffU;
             goto and_const;
 
         CASE_OP_32_64(and):
-            mask = arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[2])->z_mask;
             if (arg_is_const(op->args[2])) {
         and_const:
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
-            mask = arg_info(op->args[1])->mask & mask;
+            z_mask = arg_info(op->args[1])->z_mask & z_mask;
             break;
 
         case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->mask & 0x80000000) != 0) {
+            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
                 break;
             }
             QEMU_FALLTHROUGH;
         case INDEX_op_extu_i32_i64:
             /* We do not compute affected as it is a size changing op.  */
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
 
         CASE_OP_32_64(andc):
             /* Known-zeros does not imply known-ones.  Therefore unless
                op->args[2] is constant, we can't infer anything from it.  */
             if (arg_is_const(op->args[2])) {
-                mask = ~arg_info(op->args[2])->mask;
+                z_mask = ~arg_info(op->args[2])->z_mask;
                 goto and_const;
             }
             /* But we certainly know nothing outside args[1] may be set. */
-            mask = arg_info(op->args[1])->mask;
+            z_mask = arg_info(op->args[1])->z_mask;
             break;
 
         case INDEX_op_sar_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (int32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_sar_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (int64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_shr_i32:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 31;
-                mask = (uint32_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
         case INDEX_op_shr_i64:
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & 63;
-                mask = (uint64_t)arg_info(op->args[1])->mask >> tmp;
+                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
             }
             break;
 
         case INDEX_op_extrl_i64_i32:
-            mask = (uint32_t)arg_info(op->args[1])->mask;
+            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
             break;
         case INDEX_op_extrh_i64_i32:
-            mask = (uint64_t)arg_info(op->args[1])->mask >> 32;
+            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
             break;
 
         CASE_OP_32_64(shl):
             if (arg_is_const(op->args[2])) {
                 tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                mask = arg_info(op->args[1])->mask << tmp;
+                z_mask = arg_info(op->args[1])->z_mask << tmp;
             }
             break;
 
         CASE_OP_32_64(neg):
             /* Set to 1 all bits to the left of the rightmost.  */
-            mask = -(arg_info(op->args[1])->mask
-                     & -arg_info(op->args[1])->mask);
+            z_mask = -(arg_info(op->args[1])->z_mask
+                       & -arg_info(op->args[1])->z_mask);
             break;
 
         CASE_OP_32_64(deposit):
-            mask = deposit64(arg_info(op->args[1])->mask,
-                             op->args[3], op->args[4],
-                             arg_info(op->args[2])->mask);
+            z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                               op->args[3], op->args[4],
+                               arg_info(op->args[2])->z_mask);
             break;
 
         CASE_OP_32_64(extract):
-            mask = extract64(arg_info(op->args[1])->mask,
-                             op->args[2], op->args[3]);
+            z_mask = extract64(arg_info(op->args[1])->z_mask,
+                               op->args[2], op->args[3]);
             if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
         CASE_OP_32_64(sextract):
-            mask = sextract64(arg_info(op->args[1])->mask,
-                              op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)mask >= 0) {
-                affected = arg_info(op->args[1])->mask & ~mask;
+            z_mask = sextract64(arg_info(op->args[1])->z_mask,
+                                op->args[2], op->args[3]);
+            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
+                affected = arg_info(op->args[1])->z_mask & ~z_mask;
             }
             break;
 
         CASE_OP_32_64(or):
         CASE_OP_32_64(xor):
-            mask = arg_info(op->args[1])->mask | arg_info(op->args[2])->mask;
+            z_mask = arg_info(op->args[1])->z_mask
+                   | arg_info(op->args[2])->z_mask;
             break;
 
         case INDEX_op_clz_i32:
         case INDEX_op_ctz_i32:
-            mask = arg_info(op->args[2])->mask | 31;
+            z_mask = arg_info(op->args[2])->z_mask | 31;
             break;
 
         case INDEX_op_clz_i64:
         case INDEX_op_ctz_i64:
-            mask = arg_info(op->args[2])->mask | 63;
+            z_mask = arg_info(op->args[2])->z_mask | 63;
             break;
 
         case INDEX_op_ctpop_i32:
-            mask = 32 | 31;
+            z_mask = 32 | 31;
             break;
         case INDEX_op_ctpop_i64:
-            mask = 64 | 63;
+            z_mask = 64 | 63;
             break;
 
         CASE_OP_32_64(setcond):
         case INDEX_op_setcond2_i32:
-            mask = 1;
+            z_mask = 1;
             break;
 
         CASE_OP_32_64(movcond):
-            mask = arg_info(op->args[3])->mask | arg_info(op->args[4])->mask;
+            z_mask = arg_info(op->args[3])->z_mask
+                   | arg_info(op->args[4])->z_mask;
             break;
 
         CASE_OP_32_64(ld8u):
-            mask = 0xff;
+            z_mask = 0xff;
             break;
         CASE_OP_32_64(ld16u):
-            mask = 0xffff;
+            z_mask = 0xffff;
             break;
         case INDEX_op_ld32u_i64:
-            mask = 0xffffffffu;
+            z_mask = 0xffffffffu;
             break;
 
         CASE_OP_32_64(qemu_ld):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 MemOpIdx oi = op->args[nb_oargs + nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
-                    mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
                 }
             }
             break;
 
         CASE_OP_32_64(bswap16):
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffff) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffff) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap16(mask);
+            z_mask = bswap16(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int16_t)mask;
+                z_mask = (int16_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(16, 48);
+                z_mask |= MAKE_64BIT_MASK(16, 48);
                 break;
             }
             break;
 
         case INDEX_op_bswap32_i64:
-            mask = arg_info(op->args[1])->mask;
-            if (mask <= 0xffffffffu) {
+            z_mask = arg_info(op->args[1])->z_mask;
+            if (z_mask <= 0xffffffffu) {
                 op->args[2] |= TCG_BSWAP_IZ;
             }
-            mask = bswap32(mask);
+            z_mask = bswap32(z_mask);
             switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
             case TCG_BSWAP_OZ:
                 break;
             case TCG_BSWAP_OS:
-                mask = (int32_t)mask;
+                z_mask = (int32_t)z_mask;
                 break;
             default: /* undefined high bits */
-                mask |= MAKE_64BIT_MASK(32, 32);
+                z_mask |= MAKE_64BIT_MASK(32, 32);
                 break;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         /* 32-bit ops generate 32-bit results.  For the result is zero test
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
-        partmask = mask;
+        partmask = z_mask;
         if (!(def->flags & TCG_OPF_64BIT)) {
-            mask |= ~(tcg_target_ulong)0xffffffffu;
+            z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                    vs the high word of the input.  */
             do_setcond_high:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
-                arg_info(op->args[0])->mask = 1;
+                arg_info(op->args[0])->z_mask = 1;
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             /* Default case: we know nothing about operation (or were unable
                to compute the operation result) so no propagation is done.
                We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "mask" is
+               block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
                 memset(&temps_used, 0, sizeof(temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Save the corresponding known-zero bits mask for the
                        first output argument (only one supported so far). */
                     if (i == 0) {
-                        arg_info(op->args[i])->mask = mask;
+                        arg_info(op->args[i])->z_mask = z_mask;
                     }
                 }
             }
-- 
2.25.1

Provide what will become a larger context for splitting
the very large tcg_optimize function.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 77 ++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 37 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
 } TempOptInfo;
 
+typedef struct OptContext {
+    TCGTempSet temps_used;
+} OptContext;
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_temp(TCGArg arg)
 }
 
 /* Initialize and activate a temporary.  */
-static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
+static void init_ts_info(OptContext *ctx, TCGTemp *ts)
 {
     size_t idx = temp_idx(ts);
     TempOptInfo *ti;
 
-    if (test_bit(idx, temps_used->l)) {
+    if (test_bit(idx, ctx->temps_used.l)) {
         return;
     }
-    set_bit(idx, temps_used->l);
+    set_bit(idx, ctx->temps_used.l);
 
     ti = ts->state_ptr;
     if (ti == NULL) {
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(TCGTempSet *temps_used, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(TCGTempSet *temps_used, TCGArg arg)
+static void init_arg_info(OptContext *ctx, TCGArg arg)
 {
-    init_ts_info(temps_used, arg_temp(arg));
+    init_ts_info(ctx, arg_temp(arg));
 }
 
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
+static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
                              TCGOp *op, TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, TCGTempSet *temps_used,
 
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
-    init_ts_info(temps_used, tv);
+    init_ts_info(ctx, tv);
     tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
 }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    TCGTempSet temps_used;
+    OptContext ctx = {};
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     nb_temps = s->nb_temps;
     nb_globals = s->nb_globals;
 
-    memset(&temps_used, 0, sizeof(temps_used));
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
                 TCGTemp *ts = arg_temp(op->args[i]);
                 if (ts) {
-                    init_ts_info(&temps_used, ts);
+                    init_ts_info(&ctx, ts);
                 }
             }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
             for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&temps_used, op->args[i]);
+                init_arg_info(&ctx, op->args[i]);
             }
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], 0);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0],
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                            op->args[1], op->args[2]);
             if (tmp != 2) {
                 if (tmp) {
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[3];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &temps_used, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &temps_used, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
                 break;
             }
             goto do_default;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
                 if (tmp) {
             do_brcond_true:
-                    memset(&temps_used, 0, sizeof(temps_used));
+                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                     op->opc = INDEX_op_br;
                     op->args[0] = op->args[5];
                 } else {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     goto do_default;
                 }
             do_brcond_low:
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = INDEX_op_brcond_i32;
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &temps_used, op, op->args[0], tmp);
+                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
             } else if ((op->args[5] == TCG_COND_LT
                         || op->args[5] == TCG_COND_GE)
                        && arg_is_const(op->args[3])
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, temps_used.l)) {
+                    if (test_bit(i, ctx.temps_used.l)) {
                         reset_ts(&s->temps[i]);
                     }
                 }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                block, otherwise we only trash the output args.  "z_mask" is
                the non-zero bits mask for the first output arg.  */
             if (def->flags & TCG_OPF_BB_END) {
-                memset(&temps_used, 0, sizeof(temps_used));
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
             } else {
         do_reset_output:
                 for (i = 0; i < nb_oargs; i++) {
-- 
2.25.1

Break the final cleanup clause out of the main switch
statement.  When an opcode is fully folded to mov/movi,
use "continue" to process the next opcode; otherwise
use "break" to fall through to the final cleanup.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 190 ++++++++++++++++++++++++-------------------------
 1 file changed, 94 insertions(+), 96 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
             tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
-            break;
+            continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
-                break;
+                continue;
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
                 nb_iargs = 1;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(not):
         CASE_OP_32_64(neg):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(add):
         CASE_OP_32_64(sub):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else {
                     tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
                 }
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(deposit):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract):
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(sextract):
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(extract2):
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                     ((uint32_t)v2 << (32 - shr)));
                 }
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(setcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(brcond):
             tmp = do_constant_folding_cond(opc, op->args[0],
                                            op->args[1], op->args[2]);
-            if (tmp != 2) {
-                if (tmp) {
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[3];
-                } else {
-                    tcg_op_remove(s, op);
-                }
+            switch (tmp) {
+            case 0:
+                tcg_op_remove(s, op);
+                continue;
+            case 1:
+                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[3];
                 break;
             }
-            goto do_default;
+            break;
 
         CASE_OP_32_64(movcond):
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
                 tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
-                break;
+                continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
                 uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (fv == 1 && tv == 0) {
                     cond = tcg_invert_cond(cond);
                 } else if (!(tv == 1 && fv == 0)) {
-                    goto do_default;
+                    break;
                 }
                 op->args[3] = cond;
                 op->opc = opc = (opc == INDEX_op_movcond_i32
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                  : INDEX_op_setcond_i64);
                 nb_iargs = 2;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_add2_i32:
         case INDEX_op_sub2_i32:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_mulu2_i32:
             if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 rh = op->args[1];
                 tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
                 tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
-                break;
+                continue;
             }
-            goto do_default;
+            break;
 
         case INDEX_op_brcond2_i32:
             tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
                                             op->args[4]);
-            if (tmp != 2) {
-                if (tmp) {
-            do_brcond_true:
-                    memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                    op->opc = INDEX_op_br;
-                    op->args[0] = op->args[5];
-                } else {
+            if (tmp == 0) {
             do_brcond_false:
-                    tcg_op_remove(s, op);
-                }
-            } else if ((op->args[4] == TCG_COND_LT
-                        || op->args[4] == TCG_COND_GE)
-                       && arg_is_const(op->args[2])
-                       && arg_info(op->args[2])->val == 0
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0) {
+                tcg_op_remove(s, op);
+                continue;
+            }
+            if (tmp == 1) {
+            do_brcond_true:
+                op->opc = opc = INDEX_op_br;
+                op->args[0] = op->args[5];
+                break;
+            }
+            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
+                 && arg_is_const(op->args[2])
+                 && arg_info(op->args[2])->val == 0
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_brcond_high:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
+                op->opc = opc = INDEX_op_brcond_i32;
                 op->args[0] = op->args[1];
                 op->args[1] = op->args[3];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_brcond_false;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_brcond_low:
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[4] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_brcond_true;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (tmp != 2) {
             do_setcond_const:
                 tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
-            } else if ((op->args[5] == TCG_COND_LT
-                        || op->args[5] == TCG_COND_GE)
-                       && arg_is_const(op->args[3])
-                       && arg_info(op->args[3])->val == 0
-                       && arg_is_const(op->args[4])
-                       && arg_info(op->args[4])->val == 0) {
+                continue;
+            }
+            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
+                 && arg_is_const(op->args[3])
+                 && arg_info(op->args[3])->val == 0
+                 && arg_is_const(op->args[4])
+                 && arg_info(op->args[4])->val == 0) {
                 /* Simplify LT/GE comparisons vs zero to a single compare
                    vs the high word of the input.  */
             do_setcond_high:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->args[1] = op->args[2];
                 op->args[2] = op->args[4];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_EQ) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 if (tmp == 0) {
                     goto do_setcond_high;
                 } else if (tmp != 1) {
-                    goto do_default;
+                    break;
                 }
             do_setcond_low:
                 reset_temp(op->args[0]);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = INDEX_op_setcond_i32;
                 op->args[2] = op->args[3];
                 op->args[3] = op->args[5];
-            } else if (op->args[5] == TCG_COND_NE) {
+                break;
+            }
+            if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
                 tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 } else if (tmp == 1) {
                     goto do_setcond_const;
                 }
-                goto do_default;
-            } else {
-                goto do_default;
             }
             break;
 
-        case INDEX_op_call:
-            if (!(tcg_call_flags(op)
+        default:
+            break;
+        }
+
+        /* Some of the folding above can change opc. */
+        opc = op->opc;
+        def = &tcg_op_defs[opc];
+        if (def->flags & TCG_OPF_BB_END) {
+            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
+        } else {
+            if (opc == INDEX_op_call &&
+                !(tcg_call_flags(op)
                   & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
                 for (i = 0; i < nb_globals; i++) {
                     if (test_bit(i, ctx.temps_used.l)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     }
                 }
             }
-            goto do_reset_output;
 
-        default:
-        do_default:
-            /* Default case: we know nothing about operation (or were unable
-               to compute the operation result) so no propagation is done.
-               We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args.  "z_mask" is
-               the non-zero bits mask for the first output arg.  */
-            if (def->flags & TCG_OPF_BB_END) {
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-            } else {
-        do_reset_output:
-                for (i = 0; i < nb_oargs; i++) {
-                    reset_temp(op->args[i]);
-                    /* Save the corresponding known-zero bits mask for the
-                       first output argument (only one supported so far). */
-                    if (i == 0) {
-                        arg_info(op->args[i])->z_mask = z_mask;
-                    }
+            for (i = 0; i < nb_oargs; i++) {
+                reset_temp(op->args[i]);
+                /* Save the corresponding known-zero bits mask for the
+                   first output argument (only one supported so far). */
+                if (i == 0) {
+                    arg_info(op->args[i])->z_mask = z_mask;
                 }
             }
-            break;
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-- 
2.25.1

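Adjust the interface of tcg_opt_gen_mov and tcg_opt_gen_movi to
take only the OptContext parameter, instead of TCGContext alone
or both TCGContext and OptContext.  The TCGContext is now reached
through the new OptContext.tcg field.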

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 67 +++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 } TempOptInfo;
 
 typedef struct OptContext {
+    TCGContext *tcg;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
+static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
-        tcg_op_remove(s, op);
+        tcg_op_remove(ctx->tcg, op);
         return;
     }
 
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
     }
 }
 
-static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
-                             TCGOp *op, TCGArg dst, uint64_t val)
+static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+                             TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
     TCGType type;
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(TCGContext *s, OptContext *ctx,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(s, op, dst, temp_arg(tv));
+    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
     TCGOp *op, *op_next, *prev_mb = NULL;
-    OptContext ctx = {};
+    OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(rotr):
             if (arg_is_const(op->args[1])
                 && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (!arg_is_const(op->args[1])
                 && arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         if (partmask == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
             tcg_debug_assert(nb_oargs == 1);
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulsh):
             if (arg_is_const(op->args[2])
                 && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(or):
         CASE_OP_32_64_VEC(and):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
         CASE_OP_32_64_VEC(xor):
             if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], 0);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
+            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
                 tmp = arg_info(op->args[1])->val;
                 tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             assert(TCG_TARGET_REG_BITS == 32);
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0],
+                tcg_opt_gen_movi(&ctx, op, op->args[0],
                                  deposit64(arg_info(op->args[1])->val, 32, 32,
                                            arg_info(op->args[2])->val));
                 continue;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           op->args[2]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
                 tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
                                           arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGArg v = arg_info(op->args[1])->val;
                 if (v != 0) {
                     tmp = do_constant_folding(opc, v, 0);
-                    tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                    tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 } else {
-                    tcg_opt_gen_mov(s, op, op->args[0], op->args[2]);
+                    tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[2]);
                 }
                 continue;
             }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 tmp = deposit64(arg_info(op->args[1])->val,
                                 op->args[3], op->args[4],
                                 arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = extract64(arg_info(op->args[1])->val,
                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (arg_is_const(op->args[1])) {
                 tmp = sextract64(arg_info(op->args[1])->val,
                                  op->args[2], op->args[3]);
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     tmp = (int32_t)(((uint32_t)v1 >> shr) |
                                     ((uint32_t)v2 << (32 - shr)));
                 }
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[3]);
             if (tmp != 2) {
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             tmp = do_constant_folding_cond(opc, op->args[1],
                                            op->args[2], op->args[5]);
             if (tmp != 2) {
-                tcg_opt_gen_mov(s, op, op->args[0], op->args[4-tmp]);
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(a >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
                 rl = op->args[0];
                 rh = op->args[1];
-                tcg_opt_gen_movi(s, &ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(s, &ctx, op2, rh, (int32_t)(r >> 32));
+                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
+                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
                 continue;
             }
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                                             op->args[5]);
             if (tmp != 2) {
             do_setcond_const:
-                tcg_opt_gen_movi(s, &ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-- 
2.25.1

This will expose prev_mb to the subroutines that
will be broken out of tcg_optimize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
 
 typedef struct OptContext {
     TCGContext *tcg;
+    TCGOp *prev_mb;
     TCGTempSet temps_used;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 void tcg_optimize(TCGContext *s)
 {
     int nb_temps, nb_globals, i;
-    TCGOp *op, *op_next, *prev_mb = NULL;
+    TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
     /* Array VALS has an element for each temp.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         /* Eliminate duplicate and redundant fence instructions.  */
-        if (prev_mb) {
+        if (ctx.prev_mb) {
             switch (opc) {
             case INDEX_op_mb:
                 /* Merge two barriers of the same type into one,
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                  * barrier.  This is stricter than specified but for
                  * the purposes of TCG is better than not optimizing.
                  */
-                prev_mb->args[0] |= op->args[0];
+                ctx.prev_mb->args[0] |= op->args[0];
                 tcg_op_remove(s, op);
                 break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i64:
             case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
-                prev_mb = NULL;
+                ctx.prev_mb = NULL;
                 break;
             }
         } else if (opc == INDEX_op_mb) {
-            prev_mb = op;
+            ctx.prev_mb = op;
         }
     }
 }
-- 
2.25.1

There was no real reason for calls to have separate code here.
Unify init for calls vs non-calls using the call path, which
handles TCG_CALL_DUMMY_ARG.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
     }
 }
 
-static void init_arg_info(OptContext *ctx, TCGArg arg)
-{
-    init_ts_info(ctx, arg_temp(arg));
-}
-
 static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
 {
     TCGTemp *i, *g, *l;
@@ -XXX,XX +XXX,XX @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
     return false;
 }
 
+static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
+{
+    for (int i = 0; i < nb_args; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts) {
+            init_ts_info(ctx, ts);
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (opc == INDEX_op_call) {
             nb_oargs = TCGOP_CALLO(op);
             nb_iargs = TCGOP_CALLI(op);
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                TCGTemp *ts = arg_temp(op->args[i]);
-                if (ts) {
-                    init_ts_info(&ctx, ts);
-                }
-            }
         } else {
             nb_oargs = def->nb_oargs;
             nb_iargs = def->nb_iargs;
-            for (i = 0; i < nb_oargs + nb_iargs; i++) {
-                init_arg_info(&ctx, op->args[i]);
-            }
         }
+        init_arguments(&ctx, op, nb_oargs + nb_iargs);
 
         /* Do copy propagation */
         for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-- 
2.25.1

Continue splitting tcg_optimize.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_arguments(OptContext *ctx, TCGOp *op, int nb_args)
     }
 }
 
+static void copy_propagate(OptContext *ctx, TCGOp *op,
+                           int nb_oargs, int nb_iargs)
+{
+    TCGContext *s = ctx->tcg;
+
+    for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+        TCGTemp *ts = arg_temp(op->args[i]);
+        if (ts && ts_is_copy(ts)) {
+            op->args[i] = temp_arg(find_better_copy(s, ts));
+        }
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             nb_iargs = def->nb_iargs;
         }
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
-
-        /* Do copy propagation */
-        for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-            TCGTemp *ts = arg_temp(op->args[i]);
-            if (ts && ts_is_copy(ts)) {
-                op->args[i] = temp_arg(find_better_copy(s, ts));
-            }
-        }
+        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
-- 
2.25.1

Calls are special in that they have a variable number
of arguments, and need to be able to clobber globals.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 63 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static bool fold_call(OptContext *ctx, TCGOp *op)
+{
+    TCGContext *s = ctx->tcg;
+    int nb_oargs = TCGOP_CALLO(op);
+    int nb_iargs = TCGOP_CALLI(op);
+    int flags, i;
+
+    init_arguments(ctx, op, nb_oargs + nb_iargs);
+    copy_propagate(ctx, op, nb_oargs, nb_iargs);
+
+    /* If the function reads or writes globals, reset temp data. */
+    flags = tcg_call_flags(op);
+    if (!(flags & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
+        int nb_globals = s->nb_globals;
+
+        for (i = 0; i < nb_globals; i++) {
+            if (test_bit(i, ctx->temps_used.l)) {
+                reset_ts(&ctx->tcg->temps[i]);
+            }
+        }
+    }
+
+    /* Reset temp data for outputs. */
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+    }
+
+    /* Stop optimizing MB across calls. */
+    ctx->prev_mb = NULL;
+    return true;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
-    int nb_temps, nb_globals, i;
+    int nb_temps, i;
     TCGOp *op, *op_next;
     OptContext ctx = { .tcg = s };
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
        available through the doubly linked circular list. */
 
     nb_temps = s->nb_temps;
-    nb_globals = s->nb_globals;
-
     for (i = 0; i < nb_temps; ++i) {
         s->temps[i].state_ptr = NULL;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
-        const TCGOpDef *def = &tcg_op_defs[opc];
+        const TCGOpDef *def;
 
-        /* Count the arguments, and initialize the temps that are
-           going to be used */
+        /* Calls are special. */
         if (opc == INDEX_op_call) {
-            nb_oargs = TCGOP_CALLO(op);
-            nb_iargs = TCGOP_CALLI(op);
-        } else {
-            nb_oargs = def->nb_oargs;
-            nb_iargs = def->nb_iargs;
+            fold_call(&ctx, op);
+            continue;
         }
+
+        def = &tcg_op_defs[opc];
+        nb_oargs = def->nb_oargs;
+        nb_iargs = def->nb_iargs;
         init_arguments(&ctx, op, nb_oargs + nb_iargs);
         copy_propagate(&ctx, op, nb_oargs, nb_iargs);
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
-            if (opc == INDEX_op_call &&
-                !(tcg_call_flags(op)
-                  & (TCG_CALL_NO_READ_GLOBALS | TCG_CALL_NO_WRITE_GLOBALS))) {
-                for (i = 0; i < nb_globals; i++) {
-                    if (test_bit(i, ctx.temps_used.l)) {
-                        reset_ts(&s->temps[i]);
-                    }
-                }
-            }
-
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_st_i32:
             case INDEX_op_qemu_st8_i32:
             case INDEX_op_qemu_st_i64:
-            case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
                 ctx.prev_mb = NULL;
                 break;
-- 
2.25.1

Rather than try to keep nb_oargs and nb_iargs up-to-date across
folding, re-read nb_oargs at the end, after re-reading the opcode.

A couple of asserts need dropping, but that will take care
of itself as we split the function further.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
         uint64_t z_mask, partmask, affected, tmp;
-        int nb_oargs, nb_iargs;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         def = &tcg_op_defs[opc];
-        nb_oargs = def->nb_oargs;
-        nb_iargs = def->nb_iargs;
-        init_arguments(&ctx, op, nb_oargs + nb_iargs);
-        copy_propagate(&ctx, op, nb_oargs, nb_iargs);
+        init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
+        copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
         /* For commutative operations make constant second argument */
         switch (opc) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
 
         CASE_OP_32_64(qemu_ld):
             {
-                MemOpIdx oi = op->args[nb_oargs + nb_iargs];
+                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
                 MemOp mop = get_memop(oi);
                 if (!(mop & MO_SIGN)) {
                     z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         }
 
         if (partmask == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
             continue;
         }
         if (affected == 0) {
-            tcg_debug_assert(nb_oargs == 1);
             tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             continue;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             } else if (args_are_copies(op->args[1], op->args[2])) {
                 op->opc = INDEX_op_dup_vec;
                 TCGOP_VECE(op) = MO_32;
-                nb_iargs = 1;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 op->opc = opc = (opc == INDEX_op_movcond_i32
                                  ? INDEX_op_setcond_i32
                                  : INDEX_op_setcond_i64);
-                nb_iargs = 2;
             }
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (def->flags & TCG_OPF_BB_END) {
             memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
         } else {
+            int nb_oargs = def->nb_oargs;
             for (i = 0; i < nb_oargs; i++) {
                 reset_temp(op->args[i]);
                 /* Save the corresponding known-zero bits mask for the
-- 
2.25.1

Return -1 instead of 2 for failure, so that we can
use comparisons against 0 for all cases.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 +++++++++++++++++++++++++------------------------
 1 file changed, 74 insertions(+), 71 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
     }
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
-                                       TCGArg y, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+                                    TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
         case TCG_COND_GEU:
             return 1;
         default:
-            return 2;
+            return -1;
         }
     }
-    return 2;
+    return -1;
 }
 
-/* Return 2 if the condition can't be simplified, and the result
-   of the condition (0 or 1) if it can */
-static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
+/*
+ * Return -1 if the condition can't be simplified,
+ * and the result of the condition (0 or 1) if it can.
+ */
+static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
 {
     TCGArg al = p1[0], ah = p1[1];
     TCGArg bl = p2[0], bh = p2[1];
@@ -XXX,XX +XXX,XX @@ static TCGArg do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     if (args_are_copies(al, bl) && args_are_copies(ah, bh)) {
         return do_constant_folding_cond_eq(c);
     }
-    return 2;
+    return -1;
 }
 
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(setcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[3]);
-            if (tmp != 2) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[3]);
+            if (i >= 0) {
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             break;
 
         CASE_OP_32_64(brcond):
-            tmp = do_constant_folding_cond(opc, op->args[0],
-                                           op->args[1], op->args[2]);
-            switch (tmp) {
-            case 0:
+            i = do_constant_folding_cond(opc, op->args[0],
+                                         op->args[1], op->args[2]);
+            if (i == 0) {
                 tcg_op_remove(s, op);
                 continue;
-            case 1:
+            } else if (i > 0) {
                 memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[3];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         CASE_OP_32_64(movcond):
-            tmp = do_constant_folding_cond(opc, op->args[1],
-                                           op->args[2], op->args[5]);
-            if (tmp != 2) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4-tmp]);
+            i = do_constant_folding_cond(opc, op->args[1],
+                                         op->args[2], op->args[5]);
+            if (i >= 0) {
+                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
                 continue;
             }
             if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
 
         case INDEX_op_brcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                            op->args[4]);
-            if (tmp == 0) {
+            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
+                                          op->args[4]);
+            if (i == 0) {
             do_brcond_false:
                 tcg_op_remove(s, op);
                 continue;
             }
-            if (tmp == 1) {
+            if (i > 0) {
             do_brcond_true:
                 op->opc = opc = INDEX_op_br;
                 op->args[0] = op->args[5];
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_brcond_false;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_brcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[4] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[0], op->args[2],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[0], op->args[2],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_brcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_brcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_brcond_true;
                 }
             }
             break;
 
         case INDEX_op_setcond2_i32:
-            tmp = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                            op->args[5]);
-            if (tmp != 2) {
+            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
+                                          op->args[5]);
+            if (i >= 0) {
             do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
+                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
                 continue;
             }
             if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_EQ) {
                 /* Simplify EQ comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_const;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_high;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_EQ);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_EQ);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp != 1) {
+                } else if (i < 0) {
                     break;
                 }
             do_setcond_low:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             if (op->args[5] == TCG_COND_NE) {
                 /* Simplify NE comparisons where one of the pairs
                    can be simplified.  */
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[1], op->args[3],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[1], op->args[3],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_high;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
-                tmp = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                               op->args[2], op->args[4],
-                                               TCG_COND_NE);
-                if (tmp == 0) {
+                i = do_constant_folding_cond(INDEX_op_setcond_i32,
+                                             op->args[2], op->args[4],
+                                             TCG_COND_NE);
+                if (i == 0) {
                     goto do_setcond_low;
-                } else if (tmp == 1) {
+                } else if (i > 0) {
                     goto do_setcond_const;
                 }
             }
-- 
2.25.1

This will allow callers to tail-call tcg_opt_gen_mov and
tcg_opt_gen_movi, returning true to indicate that processing
is complete.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
     return ts_are_copies(arg_temp(arg1), arg_temp(arg2));
 }
 
-static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
+static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 
     if (ts_are_copies(dst_ts, src_ts)) {
         tcg_op_remove(ctx->tcg, op);
-        return;
+        return true;
     }
 
     reset_ts(dst_ts);
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
         di->is_const = si->is_const;
         di->val = si->val;
     }
+    return true;
 }
 
-static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
+static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
     const TCGOpDef *def = &tcg_op_defs[op->opc];
@@ -XXX,XX +XXX,XX @@ static void tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
     /* Convert movi to mov with constant temp. */
     tv = tcg_constant_internal(type, val);
     init_ts_info(ctx, tv);
-    tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
+    return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
 
 static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
-- 
2.25.1

Copy z_mask into OptContext, for writeback to the
first output within the new function, finish_folding.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 49 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGContext *tcg;
     TCGOp *prev_mb;
     TCGTempSet temps_used;
+
+    /* In flight values from optimization. */
+    uint64_t z_mask;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static void copy_propagate(OptContext *ctx, TCGOp *op,
     }
 }
 
+static void finish_folding(OptContext *ctx, TCGOp *op)
+{
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    int i, nb_oargs;
+
+    /*
+     * For an opcode that ends a BB, reset all temp data.
+     * We do no cross-BB optimization.
+     */
+    if (def->flags & TCG_OPF_BB_END) {
+        memset(&ctx->temps_used, 0, sizeof(ctx->temps_used));
+        ctx->prev_mb = NULL;
+        return;
+    }
+
+    nb_oargs = def->nb_oargs;
+    for (i = 0; i < nb_oargs; i++) {
+        reset_temp(op->args[i]);
+        /*
+         * Save the corresponding known-zero bits mask for the
+         * first output argument (only one supported so far).
+         */
+        if (i == 0) {
+            arg_info(op->args[i])->z_mask = ctx->z_mask;
+        }
+    }
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
         }
+        ctx.z_mask = z_mask;
 
         if (partmask == 0) {
             tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Some of the folding above can change opc. */
-        opc = op->opc;
-        def = &tcg_op_defs[opc];
-        if (def->flags & TCG_OPF_BB_END) {
-            memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-        } else {
-            int nb_oargs = def->nb_oargs;
-            for (i = 0; i < nb_oargs; i++) {
-                reset_temp(op->args[i]);
-                /* Save the corresponding known-zero bits mask for the
-                   first output argument (only one supported so far). */
-                if (i == 0) {
-                    arg_info(op->args[i])->z_mask = z_mask;
-                }
-            }
-        }
+        finish_folding(&ctx, op);
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
+        bool done = false;
 
         /* Calls are special. */
         if (opc == INDEX_op_call) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            allocator where needed and possible.  Also detect copies. */
         switch (opc) {
         CASE_OP_32_64_VEC(mov):
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
+            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
+            break;
 
         case INDEX_op_dup_vec:
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        finish_folding(&ctx, op);
+        if (!done) {
+            finish_folding(&ctx, op);
+        }
 
         /* Eliminate duplicate and redundant fence instructions.  */
         if (ctx.prev_mb) {
-- 
2.25.1

This puts the separate mb optimization into the same framework
as the others.  While fold_qemu_{ld,st} are currently identical,
that won't last as more code gets moved.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 +++++++++++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 38 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+/*
+ * Fold a memory barrier op.  If an earlier barrier is still recorded in
+ * ctx->prev_mb, OR this op's args[0] into that barrier and remove this op;
+ * otherwise record this op in ctx->prev_mb.
+ * Always returns true, so the caller does not run finish_folding() on it.
+ */
+static bool fold_mb(OptContext *ctx, TCGOp *op)
+{
+    /* Eliminate duplicate and redundant fence instructions.  */
+    if (ctx->prev_mb) {
+        /*
+         * Merge two barriers of the same type into one,
+         * or a weaker barrier into a stronger one,
+         * or two weaker barriers into a stronger one.
+         *   mb X; mb Y => mb X|Y
+         *   mb; strl => mb; st
+         *   ldaq; mb => ld; mb
+         *   ldaq; strl => ld; mb; st
+         * Other combinations are also merged into a strong
+         * barrier.  This is stricter than specified but for
+         * the purposes of TCG is better than not optimizing.
+         */
+        ctx->prev_mb->args[0] |= op->args[0];
+        tcg_op_remove(ctx->tcg, op);
+    } else {
+        /* No barrier recorded yet: keep this one as the merge target. */
+        ctx->prev_mb = op;
+    }
+    return true;
+}
+
+/*
+ * Handle a guest memory load op.  Clears ctx->prev_mb, so fold_mb() will
+ * not merge a later barrier into one that precedes this access.
+ * Returns false so that the caller still runs finish_folding() on the op.
+ */
+static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
+/*
+ * Handle a guest memory store op.  Clears ctx->prev_mb, so fold_mb() will
+ * not merge a later barrier into one that precedes this access.
+ * Returns false so that the caller still runs finish_folding() on the op.
+ */
+static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
+{
+    /* Opcodes that touch guest memory stop the mb optimization.  */
+    ctx->prev_mb = NULL;
+    return false;
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        case INDEX_op_mb:
+            done = fold_mb(&ctx, op);
+            break;
+        case INDEX_op_qemu_ld_i32:
+        case INDEX_op_qemu_ld_i64:
+            done = fold_qemu_ld(&ctx, op);
+            break;
+        case INDEX_op_qemu_st_i32:
+        case INDEX_op_qemu_st8_i32:
+        case INDEX_op_qemu_st_i64:
+            done = fold_qemu_st(&ctx, op);
+            break;
+
         default:
             break;
         }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         if (!done) {
             finish_folding(&ctx, op);
         }
-
-        /* Eliminate duplicate and redundant fence instructions.  */
-        if (ctx.prev_mb) {
-            switch (opc) {
-            case INDEX_op_mb:
-                /* Merge two barriers of the same type into one,
-                 * or a weaker barrier into a stronger one,
-                 * or two weaker barriers into a stronger one.
-                 *   mb X; mb Y => mb X|Y
-                 *   mb; strl => mb; st
-                 *   ldaq; mb => ld; mb
-                 *   ldaq; strl => ld; mb; st
-                 * Other combinations are also merged into a strong
-                 * barrier.  This is stricter than specified but for
-                 * the purposes of TCG is better than not optimizing.
-                 */
-                ctx.prev_mb->args[0] |= op->args[0];
-                tcg_op_remove(s, op);
-                break;
-
-            default:
-                /* Opcodes that end the block stop the optimization.  */
-                if ((def->flags & TCG_OPF_BB_END) == 0) {
-                    break;
-                }
-                /* fallthru */
-            case INDEX_op_qemu_ld_i32:
-            case INDEX_op_qemu_ld_i64:
-            case INDEX_op_qemu_st_i32:
-            case INDEX_op_qemu_st8_i32:
-            case INDEX_op_qemu_st_i64:
-                /* Opcodes that touch guest memory stop the optimization.  */
-                ctx.prev_mb = NULL;
-                break;
-            }
-        } else if (opc == INDEX_op_mb) {
-            ctx.prev_mb = op;
-        }
     }
 }
-- 
2.25.1

Split out a whole bunch of placeholder functions, which are
currently identical.  That won't last as more code gets moved.

Use CASE_OP_32_64_VEC for some logical operators that previously
missed the addition of vectors.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 271 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 219 insertions(+), 52 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
     }
 }
 
+/*
+ * The fold_* functions return true when processing is complete,
+ * usually by folding the operation to a constant or to a copy,
+ * and calling tcg_opt_gen_{mov,movi}.  They may do other things,
+ * like collect information about the value produced, for use in
+ * optimizing a subsequent operation.
+ *
+ * These first fold_* functions are all helpers, used by other
+ * folders for more specific operations.
+ */
+
+/*
+ * Helper for operations with a single input in args[1].
+ * If that input is a known constant, evaluate op->opc on it with
+ * do_constant_folding() and hand the result to tcg_opt_gen_movi() with
+ * args[0] as destination, returning whatever that call returns.
+ * Returns false, leaving the op untouched, when the input is not constant.
+ */
+static bool fold_const1(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        /* Only one input here; 0 is passed for the second operand. */
+        t = do_constant_folding(op->opc, t, 0);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+/*
+ * Helper for operations with two inputs in args[1] and args[2].
+ * If both inputs are known constants, evaluate op->opc on them with
+ * do_constant_folding() and hand the result to tcg_opt_gen_movi() with
+ * args[0] as destination, returning whatever that call returns.
+ * Returns false, leaving the op untouched, unless both inputs are constant.
+ */
+static bool fold_const2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = do_constant_folding(op->opc, t1, t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
+/*
+ * These outermost fold_<op> functions are sorted alphabetically.
+ */
+
+/* Fold add; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_add(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold and; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_and(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold andc; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_andc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+/* Fold ctpop; currently only the constant-input case, via fold_const1(). */
+static bool fold_ctpop(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+/*
+ * Fold div and divu (both are dispatched here); currently only the
+ * two-constant-inputs case, via fold_const2().
+ */
+static bool fold_divide(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold eqv; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_eqv(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/*
+ * Fold the sign-extension ops dispatched here (ext8s, ext16s, ext32s_i64,
+ * ext_i32_i64); currently only the constant-input case, via fold_const1().
+ */
+static bool fold_exts(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+/*
+ * Fold the ops dispatched here (ext8u, ext16u, ext32u_i64, extu_i32_i64,
+ * extrl_i64_i32, extrh_i64_i32); currently only the constant-input case,
+ * via fold_const1().
+ */
+static bool fold_extu(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
 static bool fold_mb(OptContext *ctx, TCGOp *op)
 {
     /* Eliminate duplicate and redundant fence instructions.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+/* Fold mul; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_mul(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/*
+ * Fold mulsh and muluh (both are dispatched here); currently only the
+ * two-constant-inputs case, via fold_const2().
+ */
+static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold nand; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_nand(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold neg; currently only the constant-input case, via fold_const1(). */
+static bool fold_neg(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+/* Fold nor; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_nor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold not; currently only the constant-input case, via fold_const1(). */
+static bool fold_not(OptContext *ctx, TCGOp *op)
+{
+    return fold_const1(ctx, op);
+}
+
+/* Fold or; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_or(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold orc; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_orc(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_st(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/*
+ * Fold rem and remu (both are dispatched here); currently only the
+ * two-constant-inputs case, via fold_const2().
+ */
+static bool fold_remainder(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/*
+ * Fold the shift and rotate ops dispatched here (rotl, rotr, sar, shl,
+ * shr); currently only the two-constant-inputs case, via fold_const2().
+ */
+static bool fold_shift(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold sub; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_sub(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
+/* Fold xor; currently only the two-constant-inputs case, via fold_const2(). */
+static bool fold_xor(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op);
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(not):
-        CASE_OP_32_64(neg):
-        CASE_OP_32_64(ext8s):
-        CASE_OP_32_64(ext8u):
-        CASE_OP_32_64(ext16s):
-        CASE_OP_32_64(ext16u):
-        CASE_OP_32_64(ctpop):
-        case INDEX_op_ext32s_i64:
-        case INDEX_op_ext32u_i64:
-        case INDEX_op_ext_i32_i64:
-        case INDEX_op_extu_i32_i64:
-        case INDEX_op_extrl_i64_i32:
-        case INDEX_op_extrh_i64_i32:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val, 0);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(bswap16):
         CASE_OP_32_64(bswap32):
         case INDEX_op_bswap64_i64:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(add):
-        CASE_OP_32_64(sub):
-        CASE_OP_32_64(mul):
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(and):
-        CASE_OP_32_64(xor):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-        CASE_OP_32_64(andc):
-        CASE_OP_32_64(orc):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-        CASE_OP_32_64(div):
-        CASE_OP_32_64(divu):
-        CASE_OP_32_64(rem):
-        CASE_OP_32_64(remu):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             if (arg_is_const(op->args[1])) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
+        default:
+            break;
+
+        /* ---------------------------------------------------------- */
+        /* Sorted alphabetically by opcode as much as possible. */
+
+        CASE_OP_32_64_VEC(add):
+            done = fold_add(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(and):
+            done = fold_and(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(andc):
+            done = fold_andc(&ctx, op);
+            break;
+        CASE_OP_32_64(ctpop):
+            done = fold_ctpop(&ctx, op);
+            break;
+        CASE_OP_32_64(div):
+        CASE_OP_32_64(divu):
+            done = fold_divide(&ctx, op);
+            break;
+        CASE_OP_32_64(eqv):
+            done = fold_eqv(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8s):
+        CASE_OP_32_64(ext16s):
+        case INDEX_op_ext32s_i64:
+        case INDEX_op_ext_i32_i64:
+            done = fold_exts(&ctx, op);
+            break;
+        CASE_OP_32_64(ext8u):
+        CASE_OP_32_64(ext16u):
+        case INDEX_op_ext32u_i64:
+        case INDEX_op_extu_i32_i64:
+        case INDEX_op_extrl_i64_i32:
+        case INDEX_op_extrh_i64_i32:
+            done = fold_extu(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(mul):
+            done = fold_mul(&ctx, op);
+            break;
+        CASE_OP_32_64(mulsh):
+        CASE_OP_32_64(muluh):
+            done = fold_mul_highpart(&ctx, op);
+            break;
+        CASE_OP_32_64(nand):
+            done = fold_nand(&ctx, op);
+            break;
+        CASE_OP_32_64(neg):
+            done = fold_neg(&ctx, op);
+            break;
+        CASE_OP_32_64(nor):
+            done = fold_nor(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(not):
+            done = fold_not(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(or):
+            done = fold_or(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(orc):
+            done = fold_orc(&ctx, op);
+            break;
         case INDEX_op_qemu_ld_i32:
         case INDEX_op_qemu_ld_i64:
             done = fold_qemu_ld(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_qemu_st_i64:
             done = fold_qemu_st(&ctx, op);
             break;
-
-        default:
+        CASE_OP_32_64(rem):
+        CASE_OP_32_64(remu):
+            done = fold_remainder(&ctx, op);
+            break;
+        CASE_OP_32_64(rotl):
+        CASE_OP_32_64(rotr):
+        CASE_OP_32_64(sar):
+        CASE_OP_32_64(shl):
+        CASE_OP_32_64(shr):
+            done = fold_shift(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(sub):
+            done = fold_sub(&ctx, op);
+            break;
+        CASE_OP_32_64_VEC(xor):
+            done = fold_xor(&ctx, op);
             break;
         }
 
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases of setcond2.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 145 ++++++++++++++++++++++++-------------------------
 1 file changed, 72 insertions(+), 73 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Fold setcond2_i32: args[0] is the output, args[1]/args[2] and
+ * args[3]/args[4] are the two operands as word pairs (args[2] and args[4]
+ * being the high words, per do_setcond_high below), args[5] is the
+ * condition.
+ *
+ * Outcomes:
+ *  - result known: the op is handed to tcg_opt_gen_movi() with that value
+ *    and its return value is returned;
+ *  - one word pair alone decides the comparison: the op is rewritten in
+ *    place to setcond_i32 on that pair, and false is returned;
+ *  - otherwise the op is left unchanged and false is returned.
+ */
+static bool fold_setcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
+    int inv = 0;
+
+    /* A non-negative i is the already-known result of the comparison. */
+    if (i >= 0) {
+        goto do_setcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0 &&
+            arg_is_const(op->args[4]) && arg_info(op->args[4])->val == 0) {
+            goto do_setcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+                                     op->args[3], cond);
+        /*
+         * i ^ inv == 0: this pair is known unequal, which settles the whole
+         * comparison with result i.  i ^ inv == 1: this pair is known equal,
+         * so only the other pair matters.  Any other value (presumably a
+         * negative i meaning "not known" -- TODO confirm) matches no case.
+         */
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            goto do_setcond_high;
+        }
+
+        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+                                     op->args[4], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_setcond_const;
+        case 1:
+            /* args[2]/args[4] pair is known equal: compare args[1], args[3]. */
+            op->args[2] = op->args[3];
+            op->args[3] = cond;
+            op->opc = INDEX_op_setcond_i32;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    /* Reached only via goto: rewrite as setcond_i32 on args[2], args[4]. */
+    do_setcond_high:
+        op->args[1] = op->args[2];
+        op->args[2] = op->args[4];
+        op->args[3] = cond;
+        op->opc = INDEX_op_setcond_i32;
+        break;
+    }
+    return false;
+
+    /* i holds the known result of the comparison. */
+ do_setcond_const:
+    return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_setcond2_i32:
-            i = do_constant_folding_cond2(&op->args[1], &op->args[3],
-                                          op->args[5]);
-            if (i >= 0) {
-            do_setcond_const:
-                tcg_opt_gen_movi(&ctx, op, op->args[0], i);
-                continue;
-            }
-            if ((op->args[5] == TCG_COND_LT || op->args[5] == TCG_COND_GE)
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0
-                 && arg_is_const(op->args[4])
-                 && arg_info(op->args[4])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_setcond_high:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_const;
-                } else if (i > 0) {
-                    goto do_setcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i < 0) {
-                    break;
-                }
-            do_setcond_low:
-                reset_temp(op->args[0]);
-                arg_info(op->args[0])->z_mask = 1;
-                op->opc = INDEX_op_setcond_i32;
-                op->args[2] = op->args[3];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[5] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_high;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-                i = do_constant_folding_cond(INDEX_op_setcond_i32,
-                                             op->args[2], op->args[4],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_setcond_low;
-                } else if (i > 0) {
-                    goto do_setcond_const;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(shr):
             done = fold_shift(&ctx, op);
             break;
+        case INDEX_op_setcond2_i32:
+            done = fold_setcond2(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reduce some code duplication by folding the NE and EQ cases of brcond2.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 159 +++++++++++++++++++++++++------------------------
 1 file changed, 81 insertions(+), 78 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Fold brcond2_i32: args[0]/args[1] and args[2]/args[3] are the two
+ * operands as word pairs (args[1] and args[3] being the high words, per
+ * do_brcond_high below), args[4] is the condition and args[5] the label.
+ *
+ * Outcomes:
+ *  - condition known false: the op is removed and true is returned;
+ *  - condition known true: the op is rewritten to an unconditional br;
+ *  - one word pair alone decides the comparison: the op is rewritten in
+ *    place to brcond_i32 on that pair;
+ *  - otherwise the op is left unchanged.
+ * All cases except the removal return false.
+ */
+static bool fold_brcond2(OptContext *ctx, TCGOp *op)
+{
+    TCGCond cond = op->args[4];
+    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
+    TCGArg label = op->args[5];
+    int inv = 0;
+
+    /* A non-negative i is the already-known result of the comparison. */
+    if (i >= 0) {
+        goto do_brcond_const;
+    }
+
+    switch (cond) {
+    case TCG_COND_LT:
+    case TCG_COND_GE:
+        /*
+         * Simplify LT/GE comparisons vs zero to a single compare
+         * vs the high word of the input.
+         */
+        if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0 &&
+            arg_is_const(op->args[3]) && arg_info(op->args[3])->val == 0) {
+            goto do_brcond_high;
+        }
+        break;
+
+    case TCG_COND_NE:
+        inv = 1;
+        QEMU_FALLTHROUGH;
+    case TCG_COND_EQ:
+        /*
+         * Simplify EQ/NE comparisons where one of the pairs
+         * can be simplified.
+         */
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+                                     op->args[2], cond);
+        /*
+         * i ^ inv == 0: this pair is known unequal, which settles the whole
+         * comparison with result i.  i ^ inv == 1: this pair is known equal,
+         * so only the other pair matters.  Any other value (presumably a
+         * negative i meaning "not known" -- TODO confirm) matches no case.
+         */
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            goto do_brcond_high;
+        }
+
+        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+                                     op->args[3], cond);
+        switch (i ^ inv) {
+        case 0:
+            goto do_brcond_const;
+        case 1:
+            /* args[1]/args[3] pair is known equal: branch on args[0], args[2]. */
+            op->opc = INDEX_op_brcond_i32;
+            op->args[1] = op->args[2];
+            op->args[2] = cond;
+            op->args[3] = label;
+            break;
+        }
+        break;
+
+    default:
+        break;
+
+    /* Reached only via goto: rewrite as brcond_i32 on args[1], args[3]. */
+    do_brcond_high:
+        op->opc = INDEX_op_brcond_i32;
+        op->args[0] = op->args[1];
+        op->args[1] = op->args[3];
+        op->args[2] = cond;
+        op->args[3] = label;
+        break;
+
+    /* Reached only via goto: i holds the known result of the comparison. */
+    do_brcond_const:
+        if (i == 0) {
+            /* Branch never taken: drop the op entirely. */
+            tcg_op_remove(ctx->tcg, op);
+            return true;
+        }
+        /* Branch always taken: turn it into an unconditional branch. */
+        op->opc = INDEX_op_br;
+        op->args[0] = label;
+        break;
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_brcond2_i32:
-            i = do_constant_folding_cond2(&op->args[0], &op->args[2],
-                                          op->args[4]);
-            if (i == 0) {
-            do_brcond_false:
-                tcg_op_remove(s, op);
-                continue;
-            }
-            if (i > 0) {
-            do_brcond_true:
-                op->opc = opc = INDEX_op_br;
-                op->args[0] = op->args[5];
-                break;
-            }
-            if ((op->args[4] == TCG_COND_LT || op->args[4] == TCG_COND_GE)
-                 && arg_is_const(op->args[2])
-                 && arg_info(op->args[2])->val == 0
-                 && arg_is_const(op->args[3])
-                 && arg_info(op->args[3])->val == 0) {
-                /* Simplify LT/GE comparisons vs zero to a single compare
-                   vs the high word of the input.  */
-            do_brcond_high:
-                op->opc = opc = INDEX_op_brcond_i32;
-                op->args[0] = op->args[1];
-                op->args[1] = op->args[3];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_EQ) {
-                /* Simplify EQ comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i > 0) {
-                    goto do_brcond_high;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_EQ);
-                if (i == 0) {
-                    goto do_brcond_false;
-                } else if (i < 0) {
-                    break;
-                }
-            do_brcond_low:
-                memset(&ctx.temps_used, 0, sizeof(ctx.temps_used));
-                op->opc = INDEX_op_brcond_i32;
-                op->args[1] = op->args[2];
-                op->args[2] = op->args[4];
-                op->args[3] = op->args[5];
-                break;
-            }
-            if (op->args[4] == TCG_COND_NE) {
-                /* Simplify NE comparisons where one of the pairs
-                   can be simplified.  */
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[0], op->args[2],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_high;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-                i = do_constant_folding_cond(INDEX_op_brcond_i32,
-                                             op->args[1], op->args[3],
-                                             TCG_COND_NE);
-                if (i == 0) {
-                    goto do_brcond_low;
-                } else if (i > 0) {
-                    goto do_brcond_true;
-                }
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(andc):
             done = fold_andc(&ctx, op);
             break;
+        case INDEX_op_brcond2_i32:
+            done = fold_brcond2(&ctx, op);
+            break;
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Fold mulu2_i32 when both inputs (args[2], args[3]) are known constants.
+ * The 64-bit product of their 32-bit values is computed and the op is
+ * replaced by two constant moves: the low 32 bits into args[0] (reusing
+ * op) and the high 32 bits into args[1] (via a new mov_i32 op inserted
+ * before op).  Returns true in that case, false otherwise.
+ */
+static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
+        uint32_t a = arg_info(op->args[2])->val;
+        uint32_t b = arg_info(op->args[3])->val;
+        /* Widen before multiplying so the full 64-bit product is kept. */
+        uint64_t r = (uint64_t)a * b;
+        TCGArg rl, rh;
+        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+        /*
+         * Read both outputs before op is rewritten below
+         * (presumably tcg_opt_gen_movi() changes op->args -- TODO confirm).
+         */
+        rl = op->args[0];
+        rh = op->args[1];
+        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
+        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+        return true;
+    }
+    return false;
+}
+
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_mulu2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-                uint32_t a = arg_info(op->args[2])->val;
-                uint32_t b = arg_info(op->args[3])->val;
-                uint64_t r = (uint64_t)a * b;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)r);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(r >> 32));
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
+        case INDEX_op_mulu2_i32:
+            done = fold_mulu2_i32(&ctx, op);
+            break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
             break;
-- 
2.25.1

Add two additional helpers, fold_add2_i32 and fold_sub2_i32,
which will not remain simple wrappers forever.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 70 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+/*
+ * Common folding for add2_i32 (add == true) and sub2_i32 (add == false).
+ * When all four inputs are known constants, the operands are assembled
+ * into 64-bit values (args[2]/args[4] as low halves, args[3]/args[5] as
+ * high halves), added or subtracted, and the op is replaced by two
+ * constant moves: the low 32 bits into args[0] (reusing op) and the high
+ * 32 bits into args[1] (via a new mov_i32 op inserted before op).
+ * Returns true in that case, false otherwise.
+ */
+static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+{
+    if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
+        arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
+        uint32_t al = arg_info(op->args[2])->val;
+        uint32_t ah = arg_info(op->args[3])->val;
+        uint32_t bl = arg_info(op->args[4])->val;
+        uint32_t bh = arg_info(op->args[5])->val;
+        uint64_t a = ((uint64_t)ah << 32) | al;
+        uint64_t b = ((uint64_t)bh << 32) | bl;
+        TCGArg rl, rh;
+        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+
+        /* Unsigned 64-bit arithmetic: carry/borrow lands in the high half. */
+        if (add) {
+            a += b;
+        } else {
+            a -= b;
+        }
+
+        /*
+         * Read both outputs before op is rewritten below
+         * (presumably tcg_opt_gen_movi() changes op->args -- TODO confirm).
+         */
+        rl = op->args[0];
+        rh = op->args[1];
+        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
+        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+        return true;
+    }
+    return false;
+}
+
+/* Fold add2_i32 via the shared helper, selecting addition. */
+static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, true);
+}
+
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+{
+    return fold_addsub2_i32(ctx, op, false);
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        case INDEX_op_add2_i32:
-        case INDEX_op_sub2_i32:
-            if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])
-                && arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-                uint32_t al = arg_info(op->args[2])->val;
-                uint32_t ah = arg_info(op->args[3])->val;
-                uint32_t bl = arg_info(op->args[4])->val;
-                uint32_t bh = arg_info(op->args[5])->val;
-                uint64_t a = ((uint64_t)ah << 32) | al;
-                uint64_t b = ((uint64_t)bh << 32) | bl;
-                TCGArg rl, rh;
-                TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_mov_i32);
-
-                if (opc == INDEX_op_add2_i32) {
-                    a += b;
-                } else {
-                    a -= b;
-                }
-
-                rl = op->args[0];
-                rh = op->args[1];
-                tcg_opt_gen_movi(&ctx, op, rl, (int32_t)a);
-                tcg_opt_gen_movi(&ctx, op2, rh, (int32_t)(a >> 32));
-                continue;
-            }
-            break;
 
         default:
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
+        case INDEX_op_add2_i32:
+            done = fold_add2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
+        case INDEX_op_sub2_i32:
+            done = fold_sub2_i32(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_movcond(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode opc = op->opc;
+    TCGCond cond = op->args[5];
+    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+
+    if (i >= 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
+    }
+
+    if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
+        uint64_t tv = arg_info(op->args[3])->val;
+        uint64_t fv = arg_info(op->args[4])->val;
+
+        opc = (opc == INDEX_op_movcond_i32
+               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+
+        if (tv == 1 && fv == 0) {
+            op->opc = opc;
+            op->args[3] = cond;
+        } else if (fv == 1 && tv == 0) {
+            op->opc = opc;
+            op->args[3] = tcg_invert_cond(cond);
+        }
+    }
+    return false;
+}
+
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(movcond):
-            i = do_constant_folding_cond(opc, op->args[1],
-                                         op->args[2], op->args[5]);
-            if (i >= 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[4 - i]);
-                continue;
-            }
-            if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
-                uint64_t tv = arg_info(op->args[3])->val;
-                uint64_t fv = arg_info(op->args[4])->val;
-                TCGCond cond = op->args[5];
-
-                if (fv == 1 && tv == 0) {
-                    cond = tcg_invert_cond(cond);
-                } else if (!(tv == 1 && fv == 0)) {
-                    break;
-                }
-                op->args[3] = cond;
-                op->opc = opc = (opc == INDEX_op_movcond_i32
-                                 ? INDEX_op_setcond_i32
-                                 : INDEX_op_setcond_i64);
-            }
-            break;
-
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64(movcond):
+            done = fold_movcond(&ctx, op);
+            break;
         CASE_OP_32_64(mul):
             done = fold_mul(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_extract2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t v1 = arg_info(op->args[1])->val;
+        uint64_t v2 = arg_info(op->args[2])->val;
+        int shr = op->args[3];
+
+        if (op->opc == INDEX_op_extract2_i64) {
+            v1 >>= shr;
+            v2 <<= 64 - shr;
+        } else {
+            v1 = (uint32_t)v1 >> shr;
+            v2 = (int32_t)v2 << (32 - shr);
+        }
+        return tcg_opt_gen_movi(ctx, op, op->args[0], v1 | v2);
+    }
+    return false;
+}
+
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
     return fold_const1(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract2):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                uint64_t v1 = arg_info(op->args[1])->val;
-                uint64_t v2 = arg_info(op->args[2])->val;
-                int shr = op->args[3];
-
-                if (opc == INDEX_op_extract2_i64) {
-                    tmp = (v1 >> shr) | (v2 << (64 - shr));
-                } else {
-                    tmp = (int32_t)(((uint32_t)v1 >> shr) |
-                                    ((uint32_t)v2 << (32 - shr)));
-                }
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract2):
+            done = fold_extract2(&ctx, op);
+            break;
         CASE_OP_32_64(ext8s):
         CASE_OP_32_64(ext16s):
         case INDEX_op_ext32s_i64:
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_extract(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = extract64(t, op->args[2], op->args[3]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
     return tcg_opt_gen_movi(ctx, op, op->args[0], i);
 }
 
+static bool fold_sextract(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t;
+
+        t = arg_info(op->args[1])->val;
+        t = sextract64(t, op->args[2], op->args[3]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(extract):
-            if (arg_is_const(op->args[1])) {
-                tmp = extract64(arg_info(op->args[1])->val,
-                                op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        CASE_OP_32_64(sextract):
-            if (arg_is_const(op->args[1])) {
-                tmp = sextract64(arg_info(op->args[1])->val,
-                                 op->args[2], op->args[3]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
+        CASE_OP_32_64(extract):
+            done = fold_extract(&ctx, op);
+            break;
         CASE_OP_32_64(extract2):
             done = fold_extract2(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_setcond2_i32:
             done = fold_setcond2(&ctx, op);
             break;
+        CASE_OP_32_64(sextract):
+            done = fold_sextract(&ctx, op);
+            break;
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
     return fold_const1(ctx, op);
 }
 
+static bool fold_deposit(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t1 = arg_info(op->args[1])->val;
+        uint64_t t2 = arg_info(op->args[2])->val;
+
+        t1 = deposit64(t1, op->args[3], op->args[4], t2);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
+    }
+    return false;
+}
+
 static bool fold_divide(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(deposit):
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tmp = deposit64(arg_info(op->args[1])->val,
-                                op->args[3], op->args[4],
-                                arg_info(op->args[2])->val);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(ctpop):
             done = fold_ctpop(&ctx, op);
             break;
+        CASE_OP_32_64(deposit):
+            done = fold_deposit(&ctx, op);
+            break;
         CASE_OP_32_64(div):
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_bswap(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+
+        t = do_constant_folding(op->opc, t, op->args[2]);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
 static bool fold_call(OptContext *ctx, TCGOp *op)
 {
     TCGContext *s = ctx->tcg;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             }
             break;
 
-        CASE_OP_32_64(bswap16):
-        CASE_OP_32_64(bswap32):
-        case INDEX_op_bswap64_i64:
-            if (arg_is_const(op->args[1])) {
-                tmp = do_constant_folding(opc, arg_info(op->args[1])->val,
-                                          op->args[2]);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_brcond2_i32:
             done = fold_brcond2(&ctx, op);
             break;
+        CASE_OP_32_64(bswap16):
+        CASE_OP_32_64(bswap32):
+        case INDEX_op_bswap64_i64:
+            done = fold_bswap(&ctx, op);
+            break;
         CASE_OP_32_64(clz):
         CASE_OP_32_64(ctz):
             done = fold_count_zeros(&ctx, op);
-- 
2.25.1

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 53 +++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_divide(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_dup(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1])) {
+        uint64_t t = arg_info(op->args[1])->val;
+        t = dup_const(TCGOP_VECE(op), t);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+    return false;
+}
+
+static bool fold_dup2(OptContext *ctx, TCGOp *op)
+{
+    if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
+        uint64_t t = deposit64(arg_info(op->args[1])->val, 32, 32,
+                               arg_info(op->args[2])->val);
+        return tcg_opt_gen_movi(ctx, op, op->args[0], t);
+    }
+
+    if (args_are_copies(op->args[1], op->args[2])) {
+        op->opc = INDEX_op_dup_vec;
+        TCGOP_VECE(op) = MO_32;
+    }
+    return false;
+}
+
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     return fold_const2(ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
             break;
 
-        case INDEX_op_dup_vec:
-            if (arg_is_const(op->args[1])) {
-                tmp = arg_info(op->args[1])->val;
-                tmp = dup_const(TCGOP_VECE(op), tmp);
-                tcg_opt_gen_movi(&ctx, op, op->args[0], tmp);
-                continue;
-            }
-            break;
-
-        case INDEX_op_dup2_vec:
-            assert(TCG_TARGET_REG_BITS == 32);
-            if (arg_is_const(op->args[1]) && arg_is_const(op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0],
-                                 deposit64(arg_info(op->args[1])->val, 32, 32,
-                                           arg_info(op->args[2])->val));
-                continue;
-            } else if (args_are_copies(op->args[1], op->args[2])) {
-                op->opc = INDEX_op_dup_vec;
-                TCGOP_VECE(op) = MO_32;
-            }
-            break;
-
         default:
             break;
 
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(divu):
             done = fold_divide(&ctx, op);
             break;
+        case INDEX_op_dup_vec:
+            done = fold_dup(&ctx, op);
+            break;
+        case INDEX_op_dup2_vec:
+            done = fold_dup2(&ctx, op);
+            break;
         CASE_OP_32_64(eqv):
             done = fold_eqv(&ctx, op);
             break;
-- 
2.25.1

This is the final entry in the main switch that was not yet
in the form of a call to a fold_<op> function.  After this, we
have the option to convert the switch into a function dispatch table.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mb(OptContext *ctx, TCGOp *op)
     return true;
 }
 
+static bool fold_mov(OptContext *ctx, TCGOp *op)
+{
+    return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+}
+
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGOpcode opc = op->opc;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Propagate constants through copy operations and do constant
-           folding.  Constants will be substituted to arguments by register
-           allocator where needed and possible.  Also detect copies. */
+        /*
+         * Process each opcode.
+         * Sorted alphabetically by opcode as much as possible.
+         */
         switch (opc) {
-        CASE_OP_32_64_VEC(mov):
-            done = tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            break;
-
-        default:
-            break;
-
-        /* ---------------------------------------------------------- */
-        /* Sorted alphabetically by opcode as much as possible. */
-
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
+        CASE_OP_32_64_VEC(mov):
+            done = fold_mov(&ctx, op);
+            break;
         CASE_OP_32_64(movcond):
             done = fold_movcond(&ctx, op);
             break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
             break;
+        default:
+            break;
         }
 
         if (!done) {
-- 
2.25.1

Pull the "op r, a, a => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to @i. */
+static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
  */
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
 
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(xor):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, a => mov r, a" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has both arguments equal, fold to identity. */
+static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
+{
+    if (args_are_copies(op->args[1], op->args[2])) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * These outermost fold_<op> functions are sorted alphabetically.
+ *
+ * The ordering of the transformations should be:
+ *   1) those that produce a constant
+ *   2) those that produce a copy
+ *   3) those that produce information about the result value.
  */
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xx_to_x(ctx, op)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, a => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(and):
-            if (args_are_copies(op->args[1], op->args[2])) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Pull the "op r, a, 0 => movi r, 0" optimization into a function,
and use it in the outer opcode fold functions.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to @i. */
+static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_i(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             continue;
         }
 
-        /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            if (arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /*
          * Process each opcode.
          * Sorted alphabetically by opcode as much as possible.
-- 
2.25.1

Compute the type of the operation early.

There were at least 4 places that used a def->flags ladder
to determine the type of the operation being optimized.

There were two places that assumed !TCG_OPF_64BIT means
TCG_TYPE_I32, and so could potentially compute incorrect
results for vector operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 149 +++++++++++++++++++++++++++++--------------------
 1 file changed, 89 insertions(+), 60 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
 
     /* In flight values from optimization. */
     uint64_t z_mask;
+    TCGType type;
 } OptContext;
 
 static inline TempOptInfo *ts_info(TCGTemp *ts)
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 {
     TCGTemp *dst_ts = arg_temp(dst);
     TCGTemp *src_ts = arg_temp(src);
-    const TCGOpDef *def;
     TempOptInfo *di;
     TempOptInfo *si;
     uint64_t z_mask;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     reset_ts(dst_ts);
     di = ts_info(dst_ts);
     si = ts_info(src_ts);
-    def = &tcg_op_defs[op->opc];
-    if (def->flags & TCG_OPF_VECTOR) {
-        new_op = INDEX_op_mov_vec;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        new_op = INDEX_op_mov_i64;
-    } else {
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
         new_op = INDEX_op_mov_i32;
+        break;
+    case TCG_TYPE_I64:
+        new_op = INDEX_op_mov_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
+        new_op = INDEX_op_mov_vec;
+        break;
+    default:
+        g_assert_not_reached();
     }
     op->opc = new_op;
-    /* TCGOP_VECL and TCGOP_VECE remain unchanged.  */
     op->args[0] = dst;
     op->args[1] = src;
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    const TCGOpDef *def = &tcg_op_defs[op->opc];
-    TCGType type;
-    TCGTemp *tv;
-
-    if (def->flags & TCG_OPF_VECTOR) {
-        type = TCGOP_VECL(op) + TCG_TYPE_V64;
-    } else if (def->flags & TCG_OPF_64BIT) {
-        type = TCG_TYPE_I64;
-    } else {
-        type = TCG_TYPE_I32;
-    }
-
     /* Convert movi to mov with constant temp. */
-    tv = tcg_constant_internal(type, val);
+    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     }
 }
 
-static uint64_t do_constant_folding(TCGOpcode op, uint64_t x, uint64_t y)
+static uint64_t do_constant_folding(TCGOpcode op, TCGType type,
+                                    uint64_t x, uint64_t y)
 {
-    const TCGOpDef *def = &tcg_op_defs[op];
     uint64_t res = do_constant_folding_2(op, x, y);
-    if (!(def->flags & TCG_OPF_64BIT)) {
+    if (type == TCG_TYPE_I32) {
         res = (int32_t)res;
     }
     return res;
@@ -XXX,XX +XXX,XX @@ static bool do_constant_folding_cond_eq(TCGCond c)
  * Return -1 if the condition can't be simplified,
  * and the result of the condition (0 or 1) if it can.
  */
-static int do_constant_folding_cond(TCGOpcode op, TCGArg x,
+static int do_constant_folding_cond(TCGType type, TCGArg x,
                                     TCGArg y, TCGCond c)
 {
     uint64_t xv = arg_info(x)->val;
     uint64_t yv = arg_info(y)->val;
 
     if (arg_is_const(x) && arg_is_const(y)) {
-        const TCGOpDef *def = &tcg_op_defs[op];
-        tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
-        if (def->flags & TCG_OPF_64BIT) {
-            return do_constant_folding_cond_64(xv, yv, c);
-        } else {
+        switch (type) {
+        case TCG_TYPE_I32:
             return do_constant_folding_cond_32(xv, yv, c);
+        case TCG_TYPE_I64:
+            return do_constant_folding_cond_64(xv, yv, c);
+        default:
+            /* Only scalar comparisons are optimizable */
+            return -1;
         }
     } else if (args_are_copies(x, y)) {
         return do_constant_folding_cond_eq(c);
@@ -XXX,XX +XXX,XX @@ static bool fold_const1(OptContext *ctx, TCGOp *op)
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = do_constant_folding(op->opc, t, 0);
+        t = do_constant_folding(op->opc, ctx->type, t, 0);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
         uint64_t t1 = arg_info(op->args[1])->val;
         uint64_t t2 = arg_info(op->args[2])->val;
 
-        t1 = do_constant_folding(op->opc, t1, t2);
+        t1 = do_constant_folding(op->opc, ctx->type, t1, t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(op->opc, op->args[0], op->args[1], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
 
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[0],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[0],
                                      op->args[2], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
             goto do_brcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_brcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
-        t = do_constant_folding(op->opc, t, op->args[2]);
+        t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         uint64_t t = arg_info(op->args[1])->val;
 
         if (t != 0) {
-            t = do_constant_folding(op->opc, t, 0);
+            t = do_constant_folding(op->opc, ctx->type, t, 0);
             return tcg_opt_gen_movi(ctx, op, op->args[0], t);
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
-    TCGOpcode opc = op->opc;
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
+        TCGOpcode opc;
 
-        opc = (opc == INDEX_op_movcond_i32
-               ? INDEX_op_setcond_i32 : INDEX_op_setcond_i64);
+        switch (ctx->type) {
+        case TCG_TYPE_I32:
+            opc = INDEX_op_setcond_i32;
+            break;
+        case TCG_TYPE_I64:
+            opc = INDEX_op_setcond_i64;
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         if (tv == 1 && fv == 0) {
             op->opc = opc;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(op->opc, op->args[1], op->args[2], cond);
+    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
 
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
          * Simplify EQ/NE comparisons where one of the pairs
          * can be simplified.
          */
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[1],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[1],
                                      op->args[3], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
             goto do_setcond_high;
         }
 
-        i = do_constant_folding_cond(INDEX_op_setcond_i32, op->args[2],
+        i = do_constant_folding_cond(TCG_TYPE_I32, op->args[2],
                                      op->args[4], cond);
         switch (i ^ inv) {
         case 0:
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         init_arguments(&ctx, op, def->nb_oargs + def->nb_iargs);
         copy_propagate(&ctx, op, def->nb_oargs, def->nb_iargs);
 
+        /* Pre-compute the type of the operation. */
+        if (def->flags & TCG_OPF_VECTOR) {
+            ctx.type = TCG_TYPE_V64 + TCGOP_VECL(op);
+        } else if (def->flags & TCG_OPF_64BIT) {
+            ctx.type = TCG_TYPE_I64;
+        } else {
+            ctx.type = TCG_TYPE_I32;
+        }
+
         /* For commutative operations make constant second argument */
         switch (opc) {
         CASE_OP_32_64_VEC(add):
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                     /* Proceed with possible constant folding. */
                     break;
                 }
-                if (opc == INDEX_op_sub_i32) {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     neg_op = INDEX_op_neg_i32;
                     have_neg = TCG_TARGET_HAS_neg_i32;
-                } else if (opc == INDEX_op_sub_i64) {
+                    break;
+                case TCG_TYPE_I64:
                     neg_op = INDEX_op_neg_i64;
                     have_neg = TCG_TARGET_HAS_neg_i64;
-                } else if (TCG_TARGET_HAS_neg_vec) {
-                    TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
-                    unsigned vece = TCGOP_VECE(op);
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
-                } else {
                     break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    neg_op = INDEX_op_neg_vec;
+                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
+                                                   TCGOP_VECE(op)) > 0;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_neg) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 TCGOpcode not_op;
                 bool have_not;
 
-                if (def->flags & TCG_OPF_VECTOR) {
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                } else if (def->flags & TCG_OPF_64BIT) {
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                } else {
+                switch (ctx.type) {
+                case TCG_TYPE_I32:
                     not_op = INDEX_op_not_i32;
                     have_not = TCG_TARGET_HAS_not_i32;
+                    break;
+                case TCG_TYPE_I64:
+                    not_op = INDEX_op_not_i64;
+                    have_not = TCG_TARGET_HAS_not_i64;
+                    break;
+                case TCG_TYPE_V64:
+                case TCG_TYPE_V128:
+                case TCG_TYPE_V256:
+                    not_op = INDEX_op_not_vec;
+                    have_not = TCG_TARGET_HAS_not_vec;
+                    break;
+                default:
+                    g_assert_not_reached();
                 }
                 if (!have_not) {
                     break;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
            below, we can ignore high bits, but for further optimizations we
            need to record that the high bits contain garbage.  */
         partmask = z_mask;
-        if (!(def->flags & TCG_OPF_64BIT)) {
+        if (ctx.type == TCG_TYPE_I32) {
             z_mask |= ~(tcg_target_ulong)0xffffffffu;
             partmask &= 0xffffffffu;
             affected &= 0xffffffffu;
-- 
2.25.1

Split out the conditional conversion from a more complex logical
operation to a simple NOT.  Create a couple more helpers to make
this easy for the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 158 +++++++++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/*
+ * Convert @op to NOT, if NOT is supported by the host.
+ * Return true if the conversion is successful, which will still
+ * indicate that the processing is complete.
+ */
+static bool fold_not(OptContext *ctx, TCGOp *op);
+static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
+{
+    TCGOpcode not_op;
+    bool have_not;
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        not_op = INDEX_op_not_i32;
+        have_not = TCG_TARGET_HAS_not_i32;
+        break;
+    case TCG_TYPE_I64:
+        not_op = INDEX_op_not_i64;
+        have_not = TCG_TARGET_HAS_not_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        not_op = INDEX_op_not_vec;
+        have_not = TCG_TARGET_HAS_not_vec;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_not) {
+        op->opc = not_op;
+        op->args[1] = op->args[idx];
+        return fold_not(ctx, op);
+    }
+    return false;
+}
+
+/* If the binary operation has first argument @i, fold to NOT. */
+static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return fold_to_not(ctx, op, 2);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to @i. */
 static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to NOT. */
+static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return fold_to_not(ctx, op, 1);
+    }
+    return false;
+}
+
 /* If the binary operation has both arguments equal, fold to @i. */
 static bool fold_xx_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, -1)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_not(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    /* Because of fold_to_not, we want to always return true, via finish. */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_ix_to_not(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 }
             }
             break;
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(nand):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64(nor):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                i = 1;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(andc):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == -1) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[2])
-                && arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                i = 2;
-                goto try_not;
-            }
-            break;
-        try_not:
-            {
-                TCGOpcode not_op;
-                bool have_not;
-
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    not_op = INDEX_op_not_i32;
-                    have_not = TCG_TARGET_HAS_not_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    not_op = INDEX_op_not_i64;
-                    have_not = TCG_TARGET_HAS_not_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    not_op = INDEX_op_not_vec;
-                    have_not = TCG_TARGET_HAS_not_vec;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_not) {
-                    break;
-                }
-                op->opc = not_op;
-                reset_temp(op->args[0]);
-                op->args[1] = op->args[i];
-                continue;
-            }
         default:
             break;
         }
-- 
2.25.1

Even though there is only one user, place this more complex
conversion into its own helper.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 89 ++++++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+    /*
+     * Because of fold_sub_to_neg, we want to always return true,
+     * via finish_folding.
+     */
+    finish_folding(ctx, op);
+    return true;
 }
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
     return fold_const2(ctx, op);
 }
 
+static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
+{
+    TCGOpcode neg_op;
+    bool have_neg;
+
+    if (!arg_is_const(op->args[1]) || arg_info(op->args[1])->val != 0) {
+        return false;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        neg_op = INDEX_op_neg_i32;
+        have_neg = TCG_TARGET_HAS_neg_i32;
+        break;
+    case TCG_TYPE_I64:
+        neg_op = INDEX_op_neg_i64;
+        have_neg = TCG_TARGET_HAS_neg_i64;
+        break;
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        neg_op = INDEX_op_neg_vec;
+        have_neg = (TCG_TARGET_HAS_neg_vec &&
+                    tcg_can_emit_vec_op(neg_op, ctx->type, TCGOP_VECE(op)) > 0);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    if (have_neg) {
+        op->opc = neg_op;
+        op->args[1] = op->args[2];
+        return fold_neg(ctx, op);
+    }
+    return false;
+}
+
 static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0)) {
+        fold_xx_to_i(ctx, op, 0) ||
+        fold_sub_to_neg(ctx, op)) {
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
                 continue;
             }
             break;
-        CASE_OP_32_64_VEC(sub):
-            {
-                TCGOpcode neg_op;
-                bool have_neg;
-
-                if (arg_is_const(op->args[2])) {
-                    /* Proceed with possible constant folding. */
-                    break;
-                }
-                switch (ctx.type) {
-                case TCG_TYPE_I32:
-                    neg_op = INDEX_op_neg_i32;
-                    have_neg = TCG_TARGET_HAS_neg_i32;
-                    break;
-                case TCG_TYPE_I64:
-                    neg_op = INDEX_op_neg_i64;
-                    have_neg = TCG_TARGET_HAS_neg_i64;
-                    break;
-                case TCG_TYPE_V64:
-                case TCG_TYPE_V128:
-                case TCG_TYPE_V256:
-                    neg_op = INDEX_op_neg_vec;
-                    have_neg = tcg_can_emit_vec_op(neg_op, ctx.type,
-                                                   TCGOP_VECE(op)) > 0;
-                    break;
-                default:
-                    g_assert_not_reached();
-                }
-                if (!have_neg) {
-                    break;
-                }
-                if (arg_is_const(op->args[1])
-                    && arg_info(op->args[1])->val == 0) {
-                    op->opc = neg_op;
-                    reset_temp(op->args[0]);
-                    op->args[1] = op->args[2];
-                    continue;
-                }
-            }
-            break;
         default:
             break;
         }
-- 
2.25.1

Pull the "op r, a, i => mov r, a" optimization into a function,
and use it in the outer-most logical operations.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 61 +++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_xi_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
     return false;
 }
 
+/* If the binary operation has second argument @i, fold to identity. */
+static bool fold_xi_to_x(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[2]) && arg_info(op->args[2])->val == i) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /* If the binary operation has second argument @i, fold to NOT. */
 static bool fold_xi_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 static bool fold_orc(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, -1) ||
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
-    return fold_const2(ctx, op);
+    if (fold_const2(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
 }
 
 static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_sub_to_neg(ctx, op)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
+        fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expression for "op r, a, const => mov r, a" cases */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(sub):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64_VEC(andc):
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == 0) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(orc):
-        CASE_OP_32_64(eqv):
-            if (!arg_is_const(op->args[1])
-                && arg_is_const(op->args[2])
-                && arg_info(op->args[2])->val == -1) {
-                tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Pull the "op r, 0, b => movi r, 0" optimization into a function,
and use it in fold_shift.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_to_not(OptContext *ctx, TCGOp *op, int idx)
     return false;
 }
 
+/* If the binary operation has first argument @i, fold to @i. */
+static bool fold_ix_to_i(OptContext *ctx, TCGOp *op, uint64_t i)
+{
+    if (arg_is_const(op->args[1]) && arg_info(op->args[1])->val == i) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], i);
+    }
+    return false;
+}
+
 /* If the binary operation has first argument @i, fold to NOT. */
 static bool fold_ix_to_not(OptContext *ctx, TCGOp *op, uint64_t i)
 {
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
+        fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify expressions for "shift/rot r, 0, a => movi r, 0",
-           and "sub r, 0, a => neg r, a" case.  */
-        switch (opc) {
-        CASE_OP_32_64(shl):
-        CASE_OP_32_64(shr):
-        CASE_OP_32_64(sar):
-        CASE_OP_32_64(rotl):
-        CASE_OP_32_64(rotr):
-            if (arg_is_const(op->args[1])
-                && arg_info(op->args[1])->val == 0) {
-                tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-                continue;
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Simplify using known-zero bits. Currently only ops with a single
            output argument is supported. */
         z_mask = -1;
-- 
2.25.1

Move all of the known-zero optimizations into the per-opcode
functions.  Use fold_masks when there is a possibility of the
result being determined, and simply set ctx->z_mask otherwise.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 545 ++++++++++++++++++++++++++-----------------------
 1 file changed, 294 insertions(+), 251 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     TCGTempSet temps_used;
 
     /* In flight values from optimization. */
-    uint64_t z_mask;
+    uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
+    uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
     TCGType type;
 } OptContext;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_masks(OptContext *ctx, TCGOp *op)
+{
+    uint64_t a_mask = ctx->a_mask;
+    uint64_t z_mask = ctx->z_mask;
+
+    /*
+     * 32-bit ops generate 32-bit results.  For the result-is-zero test
+     * below, we can ignore high bits, but for further optimizations we
+     * need to record that the high bits contain garbage.
+     */
+    if (ctx->type == TCG_TYPE_I32) {
+        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
+        a_mask &= MAKE_64BIT_MASK(0, 32);
+        z_mask &= MAKE_64BIT_MASK(0, 32);
+    }
+
+    if (z_mask == 0) {
+        return tcg_opt_gen_movi(ctx, op, op->args[0], 0);
+    }
+    if (a_mask == 0) {
+        return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[1]);
+    }
+    return false;
+}
+
 /*
  * Convert @op to NOT, if NOT is supported by the host.
 * Return true if the conversion is successful, which will still
@@ -XXX,XX +XXX,XX @@ static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1, z2;
+
     if (fold_const2(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+    z2 = arg_info(op->args[2])->z_mask;
+    ctx->z_mask = z1 & z2;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer affected bits from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        ctx->a_mask = z1 & ~z2;
+    }
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_andc(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z1;
+
     if (fold_const2(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_ix_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    z1 = arg_info(op->args[1])->z_mask;
+
+    /*
+     * Known-zeros does not imply known-ones.  Therefore unless
+     * arg2 is constant, we can't infer anything from it.
+     */
+    if (arg_is_const(op->args[2])) {
+        uint64_t z2 = ~arg_info(op->args[2])->z_mask;
+        ctx->a_mask = z1 & ~z2;
+        z1 &= z2;
+    }
+    ctx->z_mask = z1;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask, sign;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
         t = do_constant_folding(op->opc, ctx->type, t, op->args[2]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask = arg_info(op->args[1])->z_mask;
+    switch (op->opc) {
+    case INDEX_op_bswap16_i32:
+    case INDEX_op_bswap16_i64:
+        z_mask = bswap16(z_mask);
+        sign = INT16_MIN;
+        break;
+    case INDEX_op_bswap32_i32:
+    case INDEX_op_bswap32_i64:
+        z_mask = bswap32(z_mask);
+        sign = INT32_MIN;
+        break;
+    case INDEX_op_bswap64_i64:
+        z_mask = bswap64(z_mask);
+        sign = INT64_MIN;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
+    case TCG_BSWAP_OZ:
+        break;
+    case TCG_BSWAP_OS:
+        /* If the sign bit may be 1, force all the bits above to 1. */
+        if (z_mask & sign) {
+            z_mask |= sign;
+        }
+        break;
+    default:
+        /* The high bits are undefined: force all bits above the sign to 1. */
+        z_mask |= sign << 1;
+        break;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_call(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_call(OptContext *ctx, TCGOp *op)
 
 static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_count_zeros(OptContext *ctx, TCGOp *op)
         }
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[2]);
     }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        z_mask = 31;
+        break;
+    case TCG_TYPE_I64:
+        z_mask = 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    ctx->z_mask = arg_info(op->args[2])->z_mask | z_mask;
+
     return false;
 }
 
 static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    switch (ctx->type) {
+    case TCG_TYPE_I32:
+        ctx->z_mask = 32 | 31;
+        break;
+    case TCG_TYPE_I64:
+        ctx->z_mask = 64 | 63;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
 }
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
         t1 = deposit64(t1, op->args[3], op->args[4], t2);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t1);
     }
+
+    ctx->z_mask = deposit64(arg_info(op->args[1])->z_mask,
+                            op->args[3], op->args[4],
+                            arg_info(op->args[2])->z_mask);
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_extract(OptContext *ctx, TCGOp *op)
         t = extract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extract2(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask, sign;
+    bool type_change = false;  /* set for the op that widens i32 to i64 */
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8s):
+        sign = INT8_MIN;  /* as uint64_t: bit 7 and every bit above it */
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16s):
+        sign = INT16_MIN;  /* as uint64_t: bit 15 and every bit above it */
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_ext_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32s_i64:
+        sign = INT32_MIN;  /* as uint64_t: bit 31 and every bit above it */
+        z_mask = (uint32_t)z_mask;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if (z_mask & sign) {
+        z_mask |= sign;  /* sign bit may be 1, so every higher bit may be 1 too */
+    } else if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;  /* input bits dropped by the truncation above */
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_extu(OptContext *ctx, TCGOp *op)
 {
-    return fold_const1(ctx, op);
+    uint64_t z_mask_old, z_mask;
+    bool type_change = false;  /* set for ops that convert between i32 and i64 */
+
+    if (fold_const1(ctx, op)) {
+        return true;
+    }
+
+    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+
+    switch (op->opc) {
+    CASE_OP_32_64(ext8u):
+        z_mask = (uint8_t)z_mask;
+        break;
+    CASE_OP_32_64(ext16u):
+        z_mask = (uint16_t)z_mask;
+        break;
+    case INDEX_op_extrl_i64_i32:
+    case INDEX_op_extu_i32_i64:
+        type_change = true;
+        QEMU_FALLTHROUGH;
+    case INDEX_op_ext32u_i64:
+        z_mask = (uint32_t)z_mask;
+        break;
+    case INDEX_op_extrh_i64_i32:
+        type_change = true;
+        z_mask >>= 32;  /* the result mask is the high half of the input mask */
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    ctx->z_mask = z_mask;
+    if (!type_change) {
+        ctx->a_mask = z_mask_old ^ z_mask;  /* input bits dropped by the truncation above */
+    }
+    return fold_masks(ctx, op);
 }
 
 static bool fold_mb(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
 
+    ctx->z_mask = arg_info(op->args[3])->z_mask
+                | arg_info(op->args[4])->z_mask;
+
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
         uint64_t fv = arg_info(op->args[4])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
 
 static bool fold_neg(OptContext *ctx, TCGOp *op)
 {
+    uint64_t z_mask;
+
     if (fold_const1(ctx, op)) {
         return true;
     }
+
+    /* Set to 1 all bits at or to the left of the rightmost set bit.  */
+    z_mask = arg_info(op->args[1])->z_mask;
+    ctx->z_mask = -(z_mask & -z_mask);
+
     /*
      * Because of fold_sub_to_neg, we want to always return true,
      * via finish_folding.
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
         fold_xx_to_x(ctx, op)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 static bool fold_orc(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
 
 static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
 {
+    const TCGOpDef *def = &tcg_op_defs[op->opc];
+    MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
+    MemOp mop = get_memop(oi);
+    int width = 8 * memop_size(mop);
+
+    if (!(mop & MO_SIGN) && width < 64) {
+        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    }
+
     /* Opcodes that touch guest memory stop the mb optimization.  */
     ctx->prev_mb = NULL;
     return false;
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
+
+    ctx->z_mask = 1;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
         op->opc = INDEX_op_setcond_i32;
         break;
     }
+
+    ctx->z_mask = 1;
     return false;
 
  do_setcond_const:
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
+    int64_t z_mask_old, z_mask;
+
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
         t = sextract64(t, op->args[2], op->args[3]);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
-    return false;
+
+    z_mask_old = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
+    if (op->args[2] == 0 && z_mask >= 0) {
+        ctx->a_mask = z_mask_old ^ z_mask;
+    }
+    ctx->z_mask = z_mask;
+
+    return fold_masks(ctx, op);
 }
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_shift(OptContext *ctx, TCGOp *op)
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
+
+    if (arg_is_const(op->args[2])) {
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
+                                          arg_info(op->args[1])->z_mask,
+                                          arg_info(op->args[2])->val);
+        return fold_masks(ctx, op);
+    }
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
     return fold_addsub2_i32(ctx, op, false);
 }
 
+static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
+{
+    /*
+     * We can't do any folding with a load, but we can record bits.
+     * Each opcode handled here gets a z_mask covering only the low
+     * 8, 16 or 32 bits, i.e. all higher result bits are recorded as
+     * known zero.  Always returns false.
+     */
+    switch (op->opc) {
+    CASE_OP_32_64(ld8u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        break;
+    CASE_OP_32_64(ld16u):
+        ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        break;
+    case INDEX_op_ld32u_i64:
+        ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return false;
+}
+
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
     if (fold_const2(ctx, op) ||
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
-    return false;
+
+    ctx->z_mask = arg_info(op->args[1])->z_mask
+                | arg_info(op->args[2])->z_mask;
+    return fold_masks(ctx, op);
 }
 
 /* Propagate constants and copies, fold constant expressions. */
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
     }
 
     QTAILQ_FOREACH_SAFE(op, &s->ops, link, op_next) {
-        uint64_t z_mask, partmask, affected, tmp;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def;
         bool done = false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             break;
         }
 
-        /* Simplify using known-zero bits. Currently only ops with a single
-           output argument is supported. */
-        z_mask = -1;
-        affected = -1;
-        switch (opc) {
-        CASE_OP_32_64(ext8s):
-            if ((arg_info(op->args[1])->z_mask & 0x80) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext8u):
-            z_mask = 0xff;
-            goto and_const;
-        CASE_OP_32_64(ext16s):
-            if ((arg_info(op->args[1])->z_mask & 0x8000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        CASE_OP_32_64(ext16u):
-            z_mask = 0xffff;
-            goto and_const;
-        case INDEX_op_ext32s_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_ext32u_i64:
-            z_mask = 0xffffffffU;
-            goto and_const;
-
-        CASE_OP_32_64(and):
-            z_mask = arg_info(op->args[2])->z_mask;
-            if (arg_is_const(op->args[2])) {
-        and_const:
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            z_mask = arg_info(op->args[1])->z_mask & z_mask;
-            break;
-
-        case INDEX_op_ext_i32_i64:
-            if ((arg_info(op->args[1])->z_mask & 0x80000000) != 0) {
-                break;
-            }
-            QEMU_FALLTHROUGH;
-        case INDEX_op_extu_i32_i64:
-            /* We do not compute affected as it is a size changing op.  */
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-
-        CASE_OP_32_64(andc):
-            /* Known-zeros does not imply known-ones.  Therefore unless
-               op->args[2] is constant, we can't infer anything from it.  */
-            if (arg_is_const(op->args[2])) {
-                z_mask = ~arg_info(op->args[2])->z_mask;
-                goto and_const;
-            }
-            /* But we certainly know nothing outside args[1] may be set. */
-            z_mask = arg_info(op->args[1])->z_mask;
-            break;
-
-        case INDEX_op_sar_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (int32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_sar_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (int64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_shr_i32:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 31;
-                z_mask = (uint32_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-        case INDEX_op_shr_i64:
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & 63;
-                z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> tmp;
-            }
-            break;
-
-        case INDEX_op_extrl_i64_i32:
-            z_mask = (uint32_t)arg_info(op->args[1])->z_mask;
-            break;
-        case INDEX_op_extrh_i64_i32:
-            z_mask = (uint64_t)arg_info(op->args[1])->z_mask >> 32;
-            break;
-
-        CASE_OP_32_64(shl):
-            if (arg_is_const(op->args[2])) {
-                tmp = arg_info(op->args[2])->val & (TCG_TARGET_REG_BITS - 1);
-                z_mask = arg_info(op->args[1])->z_mask << tmp;
-            }
-            break;
-
-        CASE_OP_32_64(neg):
-            /* Set to 1 all bits to the left of the rightmost.  */
-            z_mask = -(arg_info(op->args[1])->z_mask
-                       & -arg_info(op->args[1])->z_mask);
-            break;
-
-        CASE_OP_32_64(deposit):
-            z_mask = deposit64(arg_info(op->args[1])->z_mask,
-                               op->args[3], op->args[4],
-                               arg_info(op->args[2])->z_mask);
-            break;
-
-        CASE_OP_32_64(extract):
-            z_mask = extract64(arg_info(op->args[1])->z_mask,
-                               op->args[2], op->args[3]);
-            if (op->args[2] == 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-        CASE_OP_32_64(sextract):
-            z_mask = sextract64(arg_info(op->args[1])->z_mask,
-                                op->args[2], op->args[3]);
-            if (op->args[2] == 0 && (tcg_target_long)z_mask >= 0) {
-                affected = arg_info(op->args[1])->z_mask & ~z_mask;
-            }
-            break;
-
-        CASE_OP_32_64(or):
-        CASE_OP_32_64(xor):
-            z_mask = arg_info(op->args[1])->z_mask
-                   | arg_info(op->args[2])->z_mask;
-            break;
-
-        case INDEX_op_clz_i32:
-        case INDEX_op_ctz_i32:
-            z_mask = arg_info(op->args[2])->z_mask | 31;
-            break;
-
-        case INDEX_op_clz_i64:
-        case INDEX_op_ctz_i64:
-            z_mask = arg_info(op->args[2])->z_mask | 63;
-            break;
-
-        case INDEX_op_ctpop_i32:
-            z_mask = 32 | 31;
-            break;
-        case INDEX_op_ctpop_i64:
-            z_mask = 64 | 63;
-            break;
-
-        CASE_OP_32_64(setcond):
-        case INDEX_op_setcond2_i32:
-            z_mask = 1;
-            break;
-
-        CASE_OP_32_64(movcond):
-            z_mask = arg_info(op->args[3])->z_mask
-                   | arg_info(op->args[4])->z_mask;
-            break;
-
-        CASE_OP_32_64(ld8u):
-            z_mask = 0xff;
-            break;
-        CASE_OP_32_64(ld16u):
-            z_mask = 0xffff;
-            break;
-        case INDEX_op_ld32u_i64:
-            z_mask = 0xffffffffu;
-            break;
-
-        CASE_OP_32_64(qemu_ld):
-            {
-                MemOpIdx oi = op->args[def->nb_oargs + def->nb_iargs];
-                MemOp mop = get_memop(oi);
-                if (!(mop & MO_SIGN)) {
-                    z_mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
-                }
-            }
-            break;
-
-        CASE_OP_32_64(bswap16):
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffff) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap16(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int16_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(16, 48);
-                break;
-            }
-            break;
-
-        case INDEX_op_bswap32_i64:
-            z_mask = arg_info(op->args[1])->z_mask;
-            if (z_mask <= 0xffffffffu) {
-                op->args[2] |= TCG_BSWAP_IZ;
-            }
-            z_mask = bswap32(z_mask);
-            switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
-            case TCG_BSWAP_OZ:
-                break;
-            case TCG_BSWAP_OS:
-                z_mask = (int32_t)z_mask;
-                break;
-            default: /* undefined high bits */
-                z_mask |= MAKE_64BIT_MASK(32, 32);
-                break;
-            }
-            break;
-
-        default:
-            break;
-        }
-
-        /* 32-bit ops generate 32-bit results.  For the result is zero test
-           below, we can ignore high bits, but for further optimizations we
-           need to record that the high bits contain garbage.  */
-        partmask = z_mask;
-        if (ctx.type == TCG_TYPE_I32) {
-            z_mask |= ~(tcg_target_ulong)0xffffffffu;
-            partmask &= 0xffffffffu;
-            affected &= 0xffffffffu;
-        }
-        ctx.z_mask = z_mask;
-
-        if (partmask == 0) {
-            tcg_opt_gen_movi(&ctx, op, op->args[0], 0);
-            continue;
-        }
-        if (affected == 0) {
-            tcg_opt_gen_mov(&ctx, op, op->args[0], op->args[1]);
-            continue;
-        }
+        /* Assume all bits affected, and no bits known zero. */
+        ctx.a_mask = -1;
+        ctx.z_mask = -1;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32u_i64:
+            done = fold_tcg_ld(&ctx, op);
+            break;
         case INDEX_op_mb:
             done = fold_mb(&ctx, op);
             break;
-- 
2.25.1

Rename fold_mulu2_i32 to fold_multiply2, and extend it to handle
muls2_i32, mulu2_i64, and muls2_i64 as well.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_mulu2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
-        uint32_t a = arg_info(op->args[2])->val;
-        uint32_t b = arg_info(op->args[3])->val;
-        uint64_t r = (uint64_t)a * b;
+        uint64_t a = arg_info(op->args[2])->val;
+        uint64_t b = arg_info(op->args[3])->val;
+        uint64_t h, l;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
+
+        switch (op->opc) {
+        case INDEX_op_mulu2_i32:
+            l = (uint64_t)(uint32_t)a * (uint32_t)b;
+            h = (int32_t)(l >> 32);
+            l = (int32_t)l;
+            break;
+        case INDEX_op_muls2_i32:
+            l = (int64_t)(int32_t)a * (int32_t)b;
+            h = l >> 32;
+            l = (int32_t)l;
+            break;
+        case INDEX_op_mulu2_i64:
+            mulu64(&l, &h, a, b);
+            break;
+        case INDEX_op_muls2_i64:
+            muls64(&l, &h, a, b);
+            break;
+        default:
+            g_assert_not_reached();
+        }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)r);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(r >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, l);
+        tcg_opt_gen_movi(ctx, op2, rh, h);
         return true;
     }
     return false;
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(muluh):
             done = fold_mul_highpart(&ctx, op);
             break;
-        case INDEX_op_mulu2_i32:
-            done = fold_mulu2_i32(&ctx, op);
+        CASE_OP_32_64(muls2):
+        CASE_OP_32_64(mulu2):
+            done = fold_multiply2(&ctx, op);
             break;
         CASE_OP_32_64(nand):
             done = fold_nand(&ctx, op);
-- 
2.25.1

Rename fold_addsub2_i32 to fold_addsub2, and handle the i64 opcodes too.
Use Int128 to implement the wider operation.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/int128.h"
 #include "tcg/tcg-op.h"
 #include "tcg-internal.h"
 
@@ -XXX,XX +XXX,XX @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_addsub2_i32(OptContext *ctx, TCGOp *op, bool add)
+static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
         arg_is_const(op->args[4]) && arg_is_const(op->args[5])) {
-        uint32_t al = arg_info(op->args[2])->val;
-        uint32_t ah = arg_info(op->args[3])->val;
-        uint32_t bl = arg_info(op->args[4])->val;
-        uint32_t bh = arg_info(op->args[5])->val;
-        uint64_t a = ((uint64_t)ah << 32) | al;
-        uint64_t b = ((uint64_t)bh << 32) | bl;
+        uint64_t al = arg_info(op->args[2])->val;
+        uint64_t ah = arg_info(op->args[3])->val;
+        uint64_t bl = arg_info(op->args[4])->val;
+        uint64_t bh = arg_info(op->args[5])->val;
         TCGArg rl, rh;
-        TCGOp *op2 = tcg_op_insert_before(ctx->tcg, op, INDEX_op_mov_i32);
+        TCGOp *op2;
 
-        if (add) {
-            a += b;
+        if (ctx->type == TCG_TYPE_I32) {
+            uint64_t a = deposit64(al, 32, 32, ah);
+            uint64_t b = deposit64(bl, 32, 32, bh);
+
+            if (add) {
+                a += b;
+            } else {
+                a -= b;
+            }
+
+            al = sextract64(a, 0, 32);
+            ah = sextract64(a, 32, 32);
         } else {
-            a -= b;
+            Int128 a = int128_make128(al, ah);
+            Int128 b = int128_make128(bl, bh);
+
+            if (add) {
+                a = int128_add(a, b);
+            } else {
+                a = int128_sub(a, b);
+            }
+
+            al = int128_getlo(a);
+            ah = int128_gethi(a);
         }
 
         rl = op->args[0];
         rh = op->args[1];
-        tcg_opt_gen_movi(ctx, op, rl, (int32_t)a);
-        tcg_opt_gen_movi(ctx, op2, rh, (int32_t)(a >> 32));
+
+        /* The proper opcode is supplied by tcg_opt_gen_mov. */
+        op2 = tcg_op_insert_before(ctx->tcg, op, 0);
+
+        tcg_opt_gen_movi(ctx, op, rl, al);
+        tcg_opt_gen_movi(ctx, op2, rh, ah);
         return true;
     }
     return false;
 }
 
-static bool fold_add2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, true);
+    return fold_addsub2(ctx, op, true);
 }
 
 static bool fold_and(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_sub2_i32(OptContext *ctx, TCGOp *op)
+static bool fold_sub2(OptContext *ctx, TCGOp *op)
 {
-    return fold_addsub2_i32(ctx, op, false);
+    return fold_addsub2(ctx, op, false);
 }
 
 static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(add):
             done = fold_add(&ctx, op);
             break;
-        case INDEX_op_add2_i32:
-            done = fold_add2_i32(&ctx, op);
+        CASE_OP_32_64(add2):
+            done = fold_add2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(and):
             done = fold_and(&ctx, op);
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64_VEC(sub):
             done = fold_sub(&ctx, op);
             break;
-        case INDEX_op_sub2_i32:
-            done = fold_sub2_i32(&ctx, op);
+        CASE_OP_32_64(sub2):
+            done = fold_sub2(&ctx, op);
             break;
         CASE_OP_32_64_VEC(xor):
             done = fold_xor(&ctx, op);
-- 
2.25.1

Most of the commutative operand swaps are handled by creating a
fold_const2_commutative, which covers all of the binary operators.
The rest were already handled on a case-by-case basis in the switch,
and have their own fold function in which to place the call.

We now have only one major switch on TCGOpcode.

Introduce NO_DEST and a block comment for swap_commutative in
order to make the handling of brcond and movcond opcodes cleaner.

Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 142 ++++++++++++++++++++++++-------------------------
 1 file changed, 70 insertions(+), 72 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static int do_constant_folding_cond2(TCGArg *p1, TCGArg *p2, TCGCond c)
     return -1;
 }
 
+/**
+ * swap_commutative:
+ * @dest: TCGArg of the destination argument, or NO_DEST.
+ * @p1: first paired argument
+ * @p2: second paired argument
+ *
+ * If *@p1 is a constant and *@p2 is not, swap.
+ * If *@p2 matches @dest, swap.
+ * Return true if a swap was performed.
+ */
+
+#define NO_DEST  temp_arg(NULL)
+
 static bool swap_commutative(TCGArg dest, TCGArg *p1, TCGArg *p2)
 {
     TCGArg a1 = *p1, a2 = *p2;
@@ -XXX,XX +XXX,XX @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return fold_const2(ctx, op);
+}
+
 static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
@@ -XXX,XX +XXX,XX @@ static bool fold_xx_to_x(OptContext *ctx, TCGOp *op)
 
 static bool fold_add(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 
 static bool fold_add2(OptContext *ctx, TCGOp *op)
 {
+    /* Note that the high and low parts may be independently swapped. */
+    swap_commutative(op->args[0], &op->args[2], &op->args[4]);
+    swap_commutative(op->args[1], &op->args[3], &op->args[5]);
+
     return fold_addsub2(ctx, op, true);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
 {
     uint64_t z1, z2;
 
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xx_to_x(ctx, op)) {
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
 static bool fold_brcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[2];
-    int i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[0], &op->args[1])) {
+        op->args[2] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[0], op->args[1], cond);
     if (i == 0) {
         tcg_op_remove(ctx->tcg, op);
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond(OptContext *ctx, TCGOp *op)
 static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[4];
-    int i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     TCGArg label = op->args[5];
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[0], &op->args[2])) {
+        op->args[4] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[0], &op->args[2], cond);
     if (i >= 0) {
         goto do_brcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_dup2(OptContext *ctx, TCGOp *op)
 
 static bool fold_eqv(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, -1) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_mov(OptContext *ctx, TCGOp *op)
 static bool fold_movcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(NO_DEST, &op->args[1], &op->args[2])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+    /*
+     * Canonicalize the "false" input reg to match the destination reg so
+     * that the tcg backend can implement a "move if true" operation.
+     */
+    if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
+        op->args[5] = cond = tcg_invert_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_mov(ctx, op, op->args[0], op->args[4 - i]);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul(OptContext *ctx, TCGOp *op)
 
 static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_i(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_mul_highpart(OptContext *ctx, TCGOp *op)
 
 static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 {
+    swap_commutative(op->args[0], &op->args[2], &op->args[3]);
+
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3])) {
         uint64_t a = arg_info(op->args[2])->val;
         uint64_t b = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_multiply2(OptContext *ctx, TCGOp *op)
 
 static bool fold_nand(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_neg(OptContext *ctx, TCGOp *op)
 
 static bool fold_nor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
 
 static bool fold_or(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xx_to_x(ctx, op)) {
         return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_remainder(OptContext *ctx, TCGOp *op)
 static bool fold_setcond(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[3];
-    int i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
+    int i;
 
+    if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
+        op->args[3] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond(ctx->type, op->args[1], op->args[2], cond);
     if (i >= 0) {
         return tcg_opt_gen_movi(ctx, op, op->args[0], i);
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond(OptContext *ctx, TCGOp *op)
 static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 {
     TCGCond cond = op->args[5];
-    int i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
-    int inv = 0;
+    int i, inv = 0;
 
+    if (swap_commutative2(&op->args[1], &op->args[3])) {
+        op->args[5] = cond = tcg_swap_cond(cond);
+    }
+
+    i = do_constant_folding_cond2(&op->args[1], &op->args[3], cond);
     if (i >= 0) {
         goto do_setcond_const;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 
 static bool fold_xor(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
+    if (fold_const2_commutative(ctx, op) ||
         fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_xi_to_not(ctx, op, -1)) {
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* For commutative operations make constant second argument */
-        switch (opc) {
-        CASE_OP_32_64_VEC(add):
-        CASE_OP_32_64_VEC(mul):
-        CASE_OP_32_64_VEC(and):
-        CASE_OP_32_64_VEC(or):
-        CASE_OP_32_64_VEC(xor):
-        CASE_OP_32_64(eqv):
-        CASE_OP_32_64(nand):
-        CASE_OP_32_64(nor):
-        CASE_OP_32_64(muluh):
-        CASE_OP_32_64(mulsh):
-            swap_commutative(op->args[0], &op->args[1], &op->args[2]);
-            break;
-        CASE_OP_32_64(brcond):
-            if (swap_commutative(-1, &op->args[0], &op->args[1])) {
-                op->args[2] = tcg_swap_cond(op->args[2]);
-            }
-            break;
-        CASE_OP_32_64(setcond):
-            if (swap_commutative(op->args[0], &op->args[1], &op->args[2])) {
-                op->args[3] = tcg_swap_cond(op->args[3]);
-            }
-            break;
-        CASE_OP_32_64(movcond):
-            if (swap_commutative(-1, &op->args[1], &op->args[2])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            /* For movcond, we canonicalize the "false" input reg to match
-               the destination reg so that the tcg backend can implement
-               a "move if true" operation.  */
-            if (swap_commutative(op->args[0], &op->args[4], &op->args[3])) {
-                op->args[5] = tcg_invert_cond(op->args[5]);
-            }
-            break;
-        CASE_OP_32_64(add2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[4]);
-            swap_commutative(op->args[1], &op->args[3], &op->args[5]);
-            break;
-        CASE_OP_32_64(mulu2):
-        CASE_OP_32_64(muls2):
-            swap_commutative(op->args[0], &op->args[2], &op->args[3]);
-            break;
-        case INDEX_op_brcond2_i32:
-            if (swap_commutative2(&op->args[0], &op->args[2])) {
-                op->args[4] = tcg_swap_cond(op->args[4]);
-            }
-            break;
-        case INDEX_op_setcond2_i32:
-            if (swap_commutative2(&op->args[1], &op->args[3])) {
-                op->args[5] = tcg_swap_cond(op->args[5]);
-            }
-            break;
-        default:
-            break;
-        }
-
         /* Assume all bits affected, and no bits known zero. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
-- 
2.25.1

This "garbage" setting pre-dates the addition of the type
changing opcodes INDEX_op_ext_i32_i64, INDEX_op_extu_i32_i64,
and INDEX_op_extr{l,h}_i64_i32.

So now we have definitive points at which to adjust z_mask
to eliminate such bits from the 32-bit operands.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
-        if (TCG_TARGET_REG_BITS > 32 && ts->type == TCG_TYPE_I32) {
-            /* High bits of a 32-bit quantity are garbage.  */
-            ti->z_mask |= ~0xffffffffull;
-        }
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     TCGTemp *src_ts = arg_temp(src);
     TempOptInfo *di;
     TempOptInfo *si;
-    uint64_t z_mask;
     TCGOpcode new_op;
 
     if (ts_are_copies(dst_ts, src_ts)) {
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[0] = dst;
     op->args[1] = src;
 
-    z_mask = si->z_mask;
-    if (TCG_TARGET_REG_BITS > 32 && new_op == INDEX_op_mov_i32) {
-        /* High bits of the destination are now garbage.  */
-        z_mask |= ~0xffffffffull;
-    }
-    di->z_mask = z_mask;
+    di->z_mask = si->z_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
 static bool tcg_opt_gen_movi(OptContext *ctx, TCGOp *op,
                              TCGArg dst, uint64_t val)
 {
-    /* Convert movi to mov with constant temp. */
-    TCGTemp *tv = tcg_constant_internal(ctx->type, val);
+    TCGTemp *tv;
 
+    if (ctx->type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    /* Convert movi to mov with constant temp. */
+    tv = tcg_constant_internal(ctx->type, val);
     init_ts_info(ctx, tv);
     return tcg_opt_gen_mov(ctx, op, dst, temp_arg(tv));
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     uint64_t z_mask = ctx->z_mask;
 
     /*
-     * 32-bit ops generate 32-bit results.  For the result is zero test
-     * below, we can ignore high bits, but for further optimizations we
-     * need to record that the high bits contain garbage.
+     * 32-bit ops generate 32-bit results, which for the purpose of
+     * simplifying tcg are sign-extended.  Certainly that's how we
+     * represent our constants elsewhere.  Note that the bits will
+     * be reset properly for a 64-bit value when encountering the
+     * type changing opcodes.
      */
     if (ctx->type == TCG_TYPE_I32) {
-        ctx->z_mask |= MAKE_64BIT_MASK(32, 32);
-        a_mask &= MAKE_64BIT_MASK(0, 32);
-        z_mask &= MAKE_64BIT_MASK(0, 32);
+        a_mask = (int32_t)a_mask;
+        z_mask = (int32_t)z_mask;
+        ctx->z_mask = z_mask;
     }
 
     if (z_mask == 0) {
-- 
2.25.1

Certain targets, like riscv, produce signed 32-bit results.
This can lead to lots of redundant extensions as values are
manipulated.

Begin by tracking only the obvious sign-extensions, and
converting them to simple copies when possible.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 102 insertions(+), 21 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ typedef struct TempOptInfo {
     TCGTemp *next_copy;
     uint64_t val;
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
 } TempOptInfo;
 
 typedef struct OptContext {
@@ -XXX,XX +XXX,XX @@ typedef struct OptContext {
     /* In flight values from optimization. */
     uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
+    uint64_t s_mask;  /* mask of clrsb(value) bits */
     TCGType type;
 } OptContext;
 
+/* Calculate the smask for a specific value. */
+static uint64_t smask_from_value(uint64_t value)
+{
+    int rep = clrsb64(value);
+    return ~(~0ull >> rep);
+}
+
+/*
+ * Calculate the smask for a given set of known-zeros.
+ * If there are lots of zeros on the left, we can consider the remainder
+ * an unsigned field, and thus the corresponding signed field is one bit
+ * larger.
+ */
+static uint64_t smask_from_zmask(uint64_t zmask)
+{
+    /*
+     * Only the 0 bits are significant for zmask, thus the msb itself
+     * must be zero, else we have no sign information.
+     */
+    int rep = clz64(zmask);
+    if (rep == 0) {
+        return 0;
+    }
+    rep -= 1;
+    return ~(~0ull >> rep);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static void reset_ts(TCGTemp *ts)
     ti->prev_copy = ts;
     ti->is_const = false;
     ti->z_mask = -1;
+    ti->s_mask = 0;
 }
 
 static void reset_temp(TCGArg arg)
@@ -XXX,XX +XXX,XX @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
+        ti->s_mask = smask_from_value(ts->val);
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
+        ti->s_mask = 0;
     }
 }
 
@@ -XXX,XX +XXX,XX @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[1] = src;
 
     di->z_mask = si->z_mask;
+    di->s_mask = si->s_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -XXX,XX +XXX,XX @@ static void finish_folding(OptContext *ctx, TCGOp *op)
 
     nb_oargs = def->nb_oargs;
     for (i = 0; i < nb_oargs; i++) {
-        reset_temp(op->args[i]);
+        TCGTemp *ts = arg_temp(op->args[i]);
+        reset_ts(ts);
         /*
-         * Save the corresponding known-zero bits mask for the
+         * Save the corresponding known-zero/sign bits mask for the
          * first output argument (only one supported so far).
          */
         if (i == 0) {
-            arg_info(op->args[i])->z_mask = ctx->z_mask;
+            ts_info(ts)->z_mask = ctx->z_mask;
+            ts_info(ts)->s_mask = ctx->s_mask;
         }
     }
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
     uint64_t z_mask = ctx->z_mask;
+    uint64_t s_mask = ctx->s_mask;
 
     /*
      * 32-bit ops generate 32-bit results, which for the purpose of
@@ -XXX,XX +XXX,XX @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (ctx->type == TCG_TYPE_I32) {
         a_mask = (int32_t)a_mask;
         z_mask = (int32_t)z_mask;
+        s_mask |= MAKE_64BIT_MASK(32, 32);
         ctx->z_mask = z_mask;
+        ctx->s_mask = s_mask;
     }
 
     if (z_mask == 0) {
@@ -XXX,XX +XXX,XX @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask, sign;
+    uint64_t z_mask, s_mask, sign;
 
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     }
 
     z_mask = arg_info(op->args[1])->z_mask;
+
     switch (op->opc) {
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
+    s_mask = smask_from_zmask(z_mask);
 
     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
     case TCG_BSWAP_OZ:
@@ -XXX,XX +XXX,XX @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
         /* If the sign bit may be 1, force all the bits above to 1. */
         if (z_mask & sign) {
             z_mask |= sign;
+            s_mask = sign << 1;
         }
         break;
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
+        s_mask = 0;
         break;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask_old, z_mask;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = extract64(t, op->args[2], op->args[3]);
+        t = extract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
     z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0) {
+    z_mask = extract64(z_mask_old, pos, len);
+    if (pos == 0) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask_old, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask, sign;
     bool type_change = false;
 
     if (fold_const1(ctx, op)) {
         return true;
     }
 
-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+    s_mask = arg_info(op->args[1])->s_mask;
+    s_mask_old = s_mask;
 
     switch (op->opc) {
     CASE_OP_32_64(ext8s):
@@ -XXX,XX +XXX,XX @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     if (z_mask & sign) {
         z_mask |= sign;
-    } else if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
     }
+    s_mask |= sign << 1;
+
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
+    if (!type_change) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
 
     return fold_masks(ctx, op);
 }
@@ -XXX,XX +XXX,XX @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
     if (!type_change) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
@@ -XXX,XX +XXX,XX @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
 
-    if (!(mop & MO_SIGN) && width < 64) {
-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    if (width < 64) {
+        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        if (!(mop & MO_SIGN)) {
+            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            ctx->s_mask <<= 1;
+        }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -XXX,XX +XXX,XX @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
-    int64_t z_mask_old, z_mask;
+    uint64_t z_mask, s_mask, s_mask_old;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = sextract64(t, op->args[2], op->args[3]);
+        t = sextract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
-    z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0 && z_mask >= 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
-    }
+    z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask, pos, len);
     ctx->z_mask = z_mask;
 
+    s_mask_old = arg_info(op->args[1])->s_mask;
+    s_mask = sextract64(s_mask_old, pos, len);
+    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
+    ctx->s_mask = s_mask;
+
+    if (pos == 0) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
+
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 {
     /* We can't do any folding with a load, but we can record bits. */
     switch (op->opc) {
+    CASE_OP_32_64(ld8s):
+        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        break;
     CASE_OP_32_64(ld8u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
+        break;
+    CASE_OP_32_64(ld16s):
+        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
         break;
     CASE_OP_32_64(ld16u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
+        break;
+    case INDEX_op_ld32s_i64:
+        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
         break;
     case INDEX_op_ld32u_i64:
         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
         break;
     default:
         g_assert_not_reached();
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* Assume all bits affected, and no bits known zero. */
+        /* Assume all bits affected, no bits known zero, no sign reps. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
+        ctx.s_mask = 0;
 
         /*
          * Process each opcode.
@@ -XXX,XX +XXX,XX @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8s):
         CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16s):
         CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32s_i64:
         case INDEX_op_ld32u_i64:
             done = fold_tcg_ld(&ctx, op);
             break;
-- 
2.25.1

Sign repetitions are perforce all identical, whether they are 1 or 0.
Bitwise operations preserve the relative quantity of the repetitions.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static bool fold_and(OptContext *ctx, TCGOp *op)
     z2 = arg_info(op->args[2])->z_mask;
     ctx->z_mask = z1 & z2;
 
+    /*
+     * Sign repetitions are perforce all identical, whether they are 1 or 0.
+     * Bitwise operations preserve the relative quantity of the repetitions.
+     */
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
+
     /*
      * Known-zeros does not imply known-ones.  Therefore unless
      * arg2 is constant, we can't infer affected bits from it.
@@ -XXX,XX +XXX,XX @@ static bool fold_andc(OptContext *ctx, TCGOp *op)
     }
     ctx->z_mask = z1;
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_movcond(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[3])->z_mask
                 | arg_info(op->args[4])->z_mask;
+    ctx->s_mask = arg_info(op->args[3])->s_mask
+                & arg_info(op->args[4])->s_mask;
 
     if (arg_is_const(op->args[3]) && arg_is_const(op->args[4])) {
         uint64_t tv = arg_info(op->args[3])->val;
@@ -XXX,XX +XXX,XX @@ static bool fold_nand(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, -1)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_nor(OptContext *ctx, TCGOp *op)
         fold_xi_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_not(OptContext *ctx, TCGOp *op)
         return true;
     }
 
+    ctx->s_mask = arg_info(op->args[1])->s_mask;
+
     /* Because of fold_to_not, we want to always return true, via finish. */
     finish_folding(ctx, op);
     return true;
@@ -XXX,XX +XXX,XX @@ static bool fold_or(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_orc(OptContext *ctx, TCGOp *op)
         fold_ix_to_not(ctx, op, 0)) {
         return true;
     }
+
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return false;
 }
 
@@ -XXX,XX +XXX,XX @@ static bool fold_xor(OptContext *ctx, TCGOp *op)
 
     ctx->z_mask = arg_info(op->args[1])->z_mask
                 | arg_info(op->args[2])->z_mask;
+    ctx->s_mask = arg_info(op->args[1])->s_mask
+                & arg_info(op->args[2])->s_mask;
     return fold_masks(ctx, op);
 }
 
-- 
2.25.1

For constant shifts, we can simply shift the s_mask.

For variable shifts, we know that sar does not reduce
the s_mask, which helps for sequences like

    ext32s_i64  t, in
    sar_i64     t, t, v
    ext32s_i64  out, t

allowing the final extend to be eliminated.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index XXXXXXX..XXXXXXX 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -XXX,XX +XXX,XX @@ static uint64_t smask_from_zmask(uint64_t zmask)
     return ~(~0ull >> rep);
 }
 
+/*
+ * Recreate a properly left-aligned smask after manipulation.
+ * Some bit-shuffling, particularly shifts and rotates, may
+ * retain sign bits on the left, but may scatter disconnected
+ * sign bits on the right.  Retain only what remains to the left.
+ */
+static uint64_t smask_from_smask(int64_t smask)
+{
+    /* Only the 1 bits are significant for smask */
+    return smask_from_zmask(~smask);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -XXX,XX +XXX,XX @@ static bool fold_sextract(OptContext *ctx, TCGOp *op)
 
 static bool fold_shift(OptContext *ctx, TCGOp *op)
 {
+    uint64_t s_mask, z_mask, sign;
+
     if (fold_const2(ctx, op) ||
         fold_ix_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0)) {
         return true;
     }
 
+    s_mask = arg_info(op->args[1])->s_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+
     if (arg_is_const(op->args[2])) {
-        ctx->z_mask = do_constant_folding(op->opc, ctx->type,
-                                          arg_info(op->args[1])->z_mask,
-                                          arg_info(op->args[2])->val);
+        int sh = arg_info(op->args[2])->val;
+
+        ctx->z_mask = do_constant_folding(op->opc, ctx->type, z_mask, sh);
+
+        s_mask = do_constant_folding(op->opc, ctx->type, s_mask, sh);
+        ctx->s_mask = smask_from_smask(s_mask);
+
         return fold_masks(ctx, op);
     }
+
+    switch (op->opc) {
+    CASE_OP_32_64(sar):
+        /*
+         * Arithmetic right shift will not reduce the number of
+         * input sign repetitions.
+         */
+        ctx->s_mask = s_mask;
+        break;
+    CASE_OP_32_64(shr):
+        /*
+         * If the sign bit is known zero, then logical right shift
+         * will not reduce the number of input sign repetitions.
+         */
+        sign = (s_mask & -s_mask) >> 1;
+        if (!(z_mask & sign)) {
+            ctx->s_mask = s_mask;
+        }
+        break;
+    default:
+        break;
+    }
+
     return false;
 }
 
-- 
2.25.1